ScaLAPACK 2.1  2.1
ScaLAPACK: Scalable Linear Algebra PACKage
PB_CpsyrkAC.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS auxiliary routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "../pblas.h"
14 #include "../PBpblas.h"
15 #include "../PBtools.h"
16 #include "../PBblacs.h"
17 #include "../PBblas.h"
18 
19 #ifdef __STDC__
20 void PB_CpsyrkAC( PBTYP_T * TYPE, char * DIRECA, char * CONJUG,
21  char * UPLO, char * TRANS, int N, int K, char * ALPHA,
22  char * A, int IA, int JA, int * DESCA, char * BETA,
23  char * C, int IC, int JC, int * DESCC )
24 #else
25 void PB_CpsyrkAC( TYPE, DIRECA, CONJUG, UPLO, TRANS, N, K, ALPHA, A, IA,
26  JA, DESCA, BETA, C, IC, JC, DESCC )
27 /*
28 * .. Scalar Arguments ..
29 */
30  char * CONJUG, * DIRECA, * TRANS, * UPLO;
31  int IA, IC, JA, JC, K, N;
32  char * ALPHA, * BETA;
33  PBTYP_T * TYPE;
34 /*
35 * .. Array Arguments ..
36 */
37  int * DESCA, * DESCC;
38  char * A, * C;
39 #endif
40 {
41 /*
42 * Purpose
43 * =======
44 *
45 * PB_CpsyrkAC performs one of the following symmetric or Hermitian rank
46 * k operations
47 *
48 * sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ),
49 * or
50 * sub( C ) := alpha*sub( A )*conjg( sub( A )' ) + beta*sub( C ),
51 * or
52 * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ),
53 * or
54 * sub( C ) := alpha*conjg( sub( A )' )*sub( A ) + beta*sub( C ),
55 *
56 * where
57 *
58 * sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1), and,
59 *
60 * sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N',
61 * A(IA:IA+K-1,JA:JA+N-1) otherwise.
62 *
63 * Alpha and beta are scalars, sub( C ) is an n by n symmetric
64 * or Hermitian submatrix and sub( A ) is an n by k submatrix in the
65 * first case and a k by n submatrix in the second case.
66 *
67 * This is the outer-product algorithm using the logical aggregation
68 * blocking technique.
69 *
70 * Notes
71 * =====
72 *
73 * A description vector is associated with each 2D block-cyclicly dis-
74 * tributed matrix. This vector stores the information required to
75 * establish the mapping between a matrix entry and its corresponding
76 * process and memory location.
77 *
78 * In the following comments, the character _ should be read as
79 * "of the distributed matrix". Let A be a generic term for any 2D
80 * block cyclicly distributed matrix. Its description vector is DESC_A:
81 *
82 * NOTATION STORED IN EXPLANATION
83 * ---------------- --------------- ------------------------------------
84 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
85 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
86 * the NPROW x NPCOL BLACS process grid
87 * A is distributed over. The context
88 * itself is global, but the handle
89 * (the integer value) may vary.
90 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
91 * ted matrix A, M_A >= 0.
92 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
93 * buted matrix A, N_A >= 0.
94 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
95 * block of the matrix A, IMB_A > 0.
96 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
97 * left block of the matrix A,
98 * INB_A > 0.
99 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
100 * bute the last M_A-IMB_A rows of A,
101 * MB_A > 0.
102 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
103 * bute the last N_A-INB_A columns of
104 * A, NB_A > 0.
105 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
106 * row of the matrix A is distributed,
107 * NPROW > RSRC_A >= 0.
108 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
109 * first column of A is distributed.
110 * NPCOL > CSRC_A >= 0.
111 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
112 * array storing the local blocks of
113 * the distributed matrix A,
114 * IF( Lc( 1, N_A ) > 0 )
115 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
116 * ELSE
117 * LLD_A >= 1.
118 *
119 * Let K be the number of rows of a matrix A starting at the global in-
120 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
121 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
122 * receive if these K rows were distributed over NPROW processes. If K
123 * is the number of columns of a matrix A starting at the global index
124 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
125 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
126 * these K columns were distributed over NPCOL processes.
127 *
128 * The values of Lr() and Lc() may be determined via a call to the func-
129 * tion PB_Cnumroc:
130 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
131 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
132 *
133 * Arguments
134 * =========
135 *
136 * TYPE (local input) pointer to a PBTYP_T structure
137 * On entry, TYPE is a pointer to a structure of type PBTYP_T,
138 * that contains type information (See pblas.h).
139 *
140 * DIRECA (global input) pointer to CHAR
141 * On entry, DIRECA specifies the direction in which the rows
142 * or columns of sub( A ) and sub( C ) should be looped over as
143 * follows:
144 * DIRECA = 'F' or 'f' forward or increasing,
145 * DIRECA = 'B' or 'b' backward or decreasing.
146 *
147 * CONJUG (global input) pointer to CHAR
148 * On entry, CONJUG specifies whether sub( C ) is a symmetric or
149 * Hermitian submatrix operand as follows:
150 * CONJUG = 'N' or 'n' sub( C ) is symmetric,
151 * CONJUG = 'Z' or 'z' sub( C ) is Hermitian.
152 *
153 * UPLO (global input) pointer to CHAR
154 * On entry, UPLO specifies whether the local pieces of
155 * the array C containing the upper or lower triangular part
156 * of the submatrix sub( C ) are to be referenced as follows:
157 * UPLO = 'U' or 'u' Only the local pieces corresponding to
158 * the upper triangular part of the
159 * submatrix sub( C ) are referenced,
160 * UPLO = 'L' or 'l' Only the local pieces corresponding to
161 * the lower triangular part of the
162 * submatrix sub( C ) are referenced.
163 *
164 * TRANS (global input) pointer to CHAR
165 * On entry, TRANS specifies the operation to be performed as
166 * follows:
167 *
168 * TRANS = 'N' or 'n'
169 * sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ),
170 * or
171 * sub( C ) := alpha*sub( A )*sub( A )' + beta*sub( C ),
172 * or
173 * sub( C ) := alpha*sub( A )*conjg( sub( A )' ) +
174 * beta*sub( C ),
175 *
176 * TRANS = 'T' or 't'
177 * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ),
178 * or
179 * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ),
180 *
181 * TRANS = 'C' or 'c'
182 * sub( C ) := alpha*sub( A )'*sub( A ) + beta*sub( C ),
183 * or
184 * sub( C ) := alpha*conjg( sub( A )' )*sub( A ) +
185 * beta*sub( C ).
186 *
187 * N (global input) INTEGER
188 * On entry, N specifies the order of the submatrix sub( C ).
189 * N must be at least zero.
190 *
191 * K (global input) INTEGER
192 * On entry, with TRANS = 'N' or 'n', K specifies the number of
193 * columns of the submatrix sub( A ), and with TRANS = 'T' or
194 * 't' or 'C' or 'c', K specifies the number of rows of the sub-
195 * matrix sub( A ). K must be at least zero.
196 *
197 * ALPHA (global input) pointer to CHAR
198 * On entry, ALPHA specifies the scalar alpha. When ALPHA is
199 * supplied as zero then the local entries of the array A
200 * corresponding to the entries of the submatrix sub( A ) need
201 * not be set on input.
202 *
203 * A (local input) pointer to CHAR
204 * On entry, A is an array of dimension (LLD_A, Ka), where Ka is
205 * at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at
206 * least Lc( 1, JA+N-1 ) otherwise. Before entry, this array
207 * contains the local entries of the matrix A.
208 * Before entry with TRANS = 'N' or 'n', this array contains the
209 * local entries corresponding to the entries of the n by k sub-
210 * matrix sub( A ), otherwise the local entries corresponding to
211 * the entries of the k by n submatrix sub( A ).
212 *
213 * IA (global input) INTEGER
214 * On entry, IA specifies A's global row index, which points to
215 * the beginning of the submatrix sub( A ).
216 *
217 * JA (global input) INTEGER
218 * On entry, JA specifies A's global column index, which points
219 * to the beginning of the submatrix sub( A ).
220 *
221 * DESCA (global and local input) INTEGER array
222 * On entry, DESCA is an integer array of dimension DLEN_. This
223 * is the array descriptor for the matrix A.
224 *
225 * BETA (global input) pointer to CHAR
226 * On entry, BETA specifies the scalar beta. When BETA is
227 * supplied as zero then the local entries of the array C
228 * corresponding to the entries of the submatrix sub( C ) need
229 * not be set on input.
230 *
231 * C (local input/local output) pointer to CHAR
232 * On entry, C is an array of dimension (LLD_C, Kc), where Kc is
233 * at least Lc( 1, JC+N-1 ). Before entry, this array contains
234 * the local entries of the matrix C.
235 * Before entry with UPLO = 'U' or 'u', this array contains
236 * the local entries corresponding to the upper triangular part
237 * of the symmetric or Hermitian submatrix sub( C ), and the
238 * local entries corresponding to the strictly lower triangular
239 * of sub( C ) are not referenced. On exit, the upper triangular
240 * part of sub( C ) is overwritten by the upper triangular part
241 * of the updated submatrix.
242 * Before entry with UPLO = 'L' or 'l', this array contains
243 * the local entries corresponding to the lower triangular part
244 * of the symmetric or Hermitian submatrix sub( C ), and the
245 * local entries corresponding to the strictly upper triangular
246 * of sub( C ) are not referenced. On exit, the lower triangular
247 * part of sub( C ) is overwritten by the lower triangular part
248 * of the updated submatrix.
249 * Note that the imaginary parts of the local entries corres-
250 * ponding to the diagonal elements of sub( C ) need not be
251 * set, they are assumed to be zero, and on exit they are set
252 * to zero.
253 *
254 * IC (global input) INTEGER
255 * On entry, IC specifies C's global row index, which points to
256 * the beginning of the submatrix sub( C ).
257 *
258 * JC (global input) INTEGER
259 * On entry, JC specifies C's global column index, which points
260 * to the beginning of the submatrix sub( C ).
261 *
262 * DESCC (global and local input) INTEGER array
263 * On entry, DESCC is an integer array of dimension DLEN_. This
264 * is the array descriptor for the matrix C.
265 *
266 * -- Written on April 1, 1998 by
267 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
268 *
269 * ---------------------------------------------------------------------
270 */
271 /*
272 * .. Local Scalars ..
273 */
274  char GatherDir, ScatterDir, * one, top, tran, * zero;
275  int Acol, Acurcol, Acurimb1, Acurinb1, Acurrow, Afr, Aii, Aimb,
276  Aimb1, Ainb, Ainb1, Ajj, Ald, Am, Amb, Amp, Amp0, An, Anb,
277  Anq, Anq0, Arow, Ccsrc, Cimb, Cinb, Cmb, Cnb, Crsrc, WAfr,
278  WCfr, WCsum, conjg, ctxt, fwd, k, kb, kbb, kend, kstart,
279  kstep, ktmp, mycol, myrow, notran, npcol, nprow, size, upper;
280  GEMM_T gemm;
281  GSUM2D_T gsum2d;
282 /*
283 * .. Local Arrays ..
284 */
285  int Ad0[DLEN_], DBUFA[DLEN_], WAd[DLEN_], WCd[DLEN_];
286  char * Aptr = NULL, * Aptr0 = NULL, * WA = NULL, * WC = NULL;
287 /* ..
288 * .. Executable Statements ..
289 *
290 */
291 /*
292 * sub( C ) = beta * sub( C )
293 */
294  PB_Cplascal( TYPE, UPLO, CONJUG, N, N, BETA, C, IC, JC, DESCC );
295 /*
296 * Retrieve process grid information
297 */
298  Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
299 
300  fwd = ( Mupcase( DIRECA[0] ) == CFORWARD );
301  conjg = ( Mupcase( CONJUG[0] ) == CCONJG );
302  upper = ( Mupcase( UPLO [0] ) == CUPPER );
303  notran = ( Mupcase( TRANS [0] ) == CNOTRAN );
304  tran = ( conjg ? CCOTRAN : CTRAN );
305 
306  size = TYPE->size; one = TYPE->one; zero = TYPE->zero;
307  gsum2d = TYPE->Cgsum2d; gemm = TYPE->Fgemm;
308 /*
309 * Figure out the loop bounds accordingly to DIRECA
310 */
311  kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
312  if( fwd )
313  {
314  kstart = 0; kend = ( ( N - 1 ) / kb + 1 ) * kb; kstep = kb;
315  GatherDir = CFORWARD; ScatterDir = CBACKWARD;
316  }
317  else
318  {
319  kstart = ( ( N - 1 ) / kb ) * kb; kend = kstep = -kb;
320  GatherDir = CBACKWARD; ScatterDir = CFORWARD;
321  }
322 /*
323 * Compute local information for A
324 */
325  if( notran ) { Am = N; An = K; } else { Am = K; An = N; }
326  PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj,
327  &Arow, &Acol );
328  Aimb = DESCA[IMB_]; Ainb = DESCA[INB_];
329  Amb = DESCA[MB_ ]; Anb = DESCA[NB_ ]; Ald = DESCA[LLD_];
330 
331  Aimb1 = PB_Cfirstnb( Am, IA, Aimb, Amb );
332  Amp0 = PB_Cnumroc( Am, 0, Aimb1, Amb, myrow, Arow, nprow );
333  Ainb1 = PB_Cfirstnb( An, JA, Ainb, Anb );
334  Anq0 = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol );
335  if( ( Amp0 > 0 ) && ( Anq0 > 0 ) ) Aptr0 = Mptr( A, Aii, Ajj, Ald, size );
336 
337  if( notran )
338  {
339  top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
340  Cinb = DESCC[INB_]; Cnb = DESCC[NB_]; Ccsrc = DESCC[CSRC_];
341 
342  if( upper )
343  {
344  for( k = kstart; k != kend; k += kstep )
345  {
346  kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
347 /*
348 * Accumulate A( IA+k:IA+k+kbb-1, JA:JA+K-1 )
349 */
350  PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, An, A, IA+k, JA, DESCA,
351  ROW, &Aptr, DBUFA, &Afr );
352 /*
353 * Replicate A( IA+k:IA+k+kbb-1, JA:JA+K-1 ) over A( IA:IA+k+kbb-1, JA:JA+K-1 )
354 */
355  PB_Cdescset( Ad0, ktmp, An, Aimb1, Ainb1, Amb, Anb, Arow, Acol,
356  ctxt, Ald );
357  PB_CInV( TYPE, NOCONJG, ROW, ktmp, An, Ad0, kbb, Aptr, 0, 0, DBUFA,
358  ROW, &WA, WAd, &WAfr );
359 /*
360 * WC := A( IA:IA+k+kbb-1, JA:JA+K-1 ) * A( IA+k:IA+k+kbb-1, JA:JA+K-1 )'
361 */
362  PB_COutV( TYPE, COLUMN, INIT, ktmp, An, Ad0, kbb, &WC, WCd, &WCfr,
363  &WCsum );
364  Amp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow );
365  if( ( Amp > 0 ) && ( Anq0 > 0 ) )
366  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Amp, &kbb, &Anq0,
367  ALPHA, Aptr0, &Ald, WA, &WAd[LLD_], zero, WC, &WCd[LLD_] );
368  if( WAfr ) free( WA );
369  if( Afr ) free( Aptr );
370 
371  if( WCsum )
372  {
373  WCd[CSRC_] = PB_Cindxg2p( JC + ( fwd ? k : ktmp - 1 ), Cinb,
374  Cnb, Ccsrc, Ccsrc, npcol );
375  if( Amp > 0 )
376  gsum2d( ctxt, ROW, &top, Amp, kbb, WC, WCd[LLD_], myrow,
377  WCd[CSRC_] );
378  }
379 /*
380 * Zero lower triangle of WC( k:k+kbb-1, 0:kbb-1 )
381 */
382  if( conjg )
383  PB_Cplapad( TYPE, LOWER, CONJG, kbb, kbb, zero, zero, WC,
384  k, 0, WCd );
385  else if( kbb > 1 )
386  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, WC,
387  k+1, 0, WCd );
388 /*
389 * Add WC to C( IC:IC+k+kbb-1, JC+k:JC+k+kbb-1 )
390 */
391  PB_CScatterV( TYPE, &ScatterDir, ktmp, kbb, WC, 0, 0, WCd, COLUMN,
392  one, C, IC, JC+k, DESCC, COLUMN );
393  if( WCfr ) free( WC );
394  }
395  }
396  else
397  {
398  for( k = kstart; k != kend; k += kstep )
399  {
400  ktmp = N - k; kbb = MIN( ktmp, kb );
401 /*
402 * Accumulate A( IA+k:IA+k+kbb-1, JA:JA+K-1 )
403 */
404  PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, An, A, IA+k, JA, DESCA,
405  ROW, &Aptr, DBUFA, &Afr );
406 /*
407 * Replicate A( IA+k:IA+k+kbb-1, JA:JA+K-1 ) over A( IA+k:IA+N-1, JA:JA+K-1 )
408 */
409  Acurimb1 = PB_Cfirstnb( ktmp, IA+k, Aimb, Amb );
410  Acurrow = PB_Cindxg2p( k, Aimb1, Amb, Arow, Arow, nprow );
411  PB_Cdescset( Ad0, ktmp, An, Acurimb1, Ainb1, Amb, Anb, Acurrow,
412  Acol, ctxt, Ald );
413  PB_CInV( TYPE, NOCONJG, ROW, ktmp, An, Ad0, kbb, Aptr, 0, 0, DBUFA,
414  ROW, &WA, WAd, &WAfr );
415 /*
416 * WC := A( IA+k:IA+N-1, JA:JA+K-1 ) * A( IA+k:IA+k+kbb-1, JA:JA+K-1 )'
417 */
418  PB_COutV( TYPE, COLUMN, INIT, ktmp, An, Ad0, kbb, &WC, WCd, &WCfr,
419  &WCsum );
420  Amp = PB_Cnumroc( ktmp, k, Aimb1, Amb, myrow, Arow, nprow );
421  if( ( Amp > 0 ) && ( Anq0 > 0 ) )
422  gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Amp, &kbb, &Anq0,
423  ALPHA, Mptr( Aptr0, Amp0-Amp, 0, Ald, size ), &Ald, WA,
424  &WAd[LLD_], zero, WC, &WCd[LLD_] );
425  if( WAfr ) free( WA );
426  if( Afr ) free( Aptr );
427 
428  if( WCsum )
429  {
430  WCd[CSRC_] = PB_Cindxg2p( JC + ( fwd ? k : k + kbb - 1 ), Cinb,
431  Cnb, Ccsrc, Ccsrc, npcol );
432  if( Amp > 0 )
433  gsum2d( ctxt, ROW, &top, Amp, kbb, WC, WCd[LLD_], myrow,
434  WCd[CSRC_] );
435  }
436 /*
437 * Zero upper triangle of WC
438 */
439  if( conjg )
440  PB_Cplapad( TYPE, UPPER, CONJG, kbb, kbb, zero, zero, WC,
441  0, 0, WCd );
442  else if( kbb > 1 )
443  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WC,
444  0, 1, WCd );
445 /*
446 * Add WC to C( IC+k:IC+N-1, JC+k:JC+k+kbb-1 )
447 */
448  PB_CScatterV( TYPE, &ScatterDir, ktmp, kbb, WC, 0, 0, WCd, COLUMN,
449  one, C, IC+k, JC+k, DESCC, COLUMN );
450  if( WCfr ) free( WC );
451  }
452  }
453  }
454  else
455  {
456  top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
457  Cimb = DESCC[IMB_]; Cmb = DESCC[MB_]; Crsrc = DESCC[RSRC_];
458 
459  if( upper )
460  {
461  for( k = kstart; k != kend; k += kstep )
462  {
463  ktmp = N - k; kbb = MIN( ktmp, kb );
464 /*
465 * Accumulate A( IA:IA+K-1, JA+k:JA+k+kbb-1 )
466 */
467  PB_CGatherV( TYPE, REUSE, &GatherDir, Am, kbb, A, IA, JA+k, DESCA,
468  COLUMN, &Aptr, DBUFA, &Afr );
469 /*
470 * Replicate A( IA:IA+K-1, JA+k:JA+k+kbb-1 ) over A( IA:IA+K-1, JA+k:JA+N-1 )
471 */
472  Acurinb1 = PB_Cfirstnb( ktmp, JA+k, Ainb, Anb );
473  Acurcol = PB_Cindxg2p( k, Ainb1, Anb, Acol, Acol, npcol );
474  PB_Cdescset( Ad0, Am, ktmp, Aimb1, Acurinb1, Amb, Anb, Arow,
475  Acurcol, ctxt, Ald );
476  PB_CInV( TYPE, NOCONJG, COLUMN, Am, ktmp, Ad0, kbb, Aptr, 0, 0,
477  DBUFA, COLUMN, &WA, WAd, &WAfr );
478 /*
479 * WC := A( IA:IA+K-1, JA+k:JA+k+kbb-1 )' * A( IA:IA+K-1,JA+k:JA+N-1 )
480 */
481  PB_COutV( TYPE, ROW, INIT, Am, ktmp, Ad0, kbb, &WC, WCd, &WCfr,
482  &WCsum );
483  Anq = PB_Cnumroc( ktmp, k, Ainb1, Anb, mycol, Acol, npcol );
484  if( ( Anq > 0 ) && ( Amp0 > 0 ) )
485  gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Anq, &Amp0,
486  ALPHA, WA, &WAd[LLD_], Mptr( Aptr0, 0, Anq0-Anq, Ald,
487  size ), &Ald, zero, WC, &WCd[LLD_] );
488  if( WAfr ) free( WA );
489  if( Afr ) free( Aptr );
490 
491  if( WCsum )
492  {
493  WCd[RSRC_] = PB_Cindxg2p( IC + ( fwd ? k : k + kbb - 1 ), Cimb,
494  Cmb, Crsrc, Crsrc, nprow );
495  if( Anq > 0 )
496  gsum2d( ctxt, COLUMN, &top, kbb, Anq, WC, WCd[LLD_],
497  WCd[RSRC_], mycol );
498  }
499 /*
500 * Zero lower triangle of WC
501 */
502  if( conjg )
503  PB_Cplapad( TYPE, LOWER, CONJG, kbb, kbb, zero, zero, WC,
504  0, 0, WCd );
505  else if( kbb > 1 )
506  PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, WC,
507  1, 0, WCd );
508 /*
509 * Add WC to C( IC+k:IC+k+kbb-1, JC+k:JC+N-1 )
510 */
511  PB_CScatterV( TYPE, &ScatterDir, kbb, ktmp, WC, 0, 0, WCd, ROW, one,
512  C, IC+k, JC+k, DESCC, ROW );
513  if( WCfr ) free( WC );
514  }
515  }
516  else
517  {
518  for( k = kstart; k != kend; k += kstep )
519  {
520  kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
521 /*
522 * Accumulate A( IA:IA+K-1, JA+k:JA+k+kbb-1 )
523 */
524  PB_CGatherV( TYPE, REUSE, &GatherDir, Am, kbb, A, IA, JA+k, DESCA,
525  COLUMN, &Aptr, DBUFA, &Afr );
526 /*
527 * Replicate A( IA:IA+K-1, JA+k:JA+k+kbb-1 ) over A( IA:IA+K-1, JA:JA+k+kbb-1 )
528 */
529  PB_Cdescset( Ad0, Am, ktmp, Aimb1, Ainb1, Amb, Anb, Arow, Acol,
530  ctxt, Ald );
531  PB_CInV( TYPE, NOCONJG, COLUMN, Am, ktmp, Ad0, kbb, Aptr, 0, 0,
532  DBUFA, COLUMN, &WA, WAd, &WAfr );
533 /*
534 * WC := A( IA:IA+K-1, JA+k:JA+k+kbb-1 )' * A( IA:IA+K-1, JA:JA+k+kbb-1 )
535 */
536  PB_COutV( TYPE, ROW, INIT, Am, ktmp, Ad0, kbb, &WC, WCd, &WCfr,
537  &WCsum );
538  Anq = PB_Cnumroc( ktmp, 0, Ainb1, Anb, mycol, Acol, npcol );
539  if( ( Anq > 0 ) && ( Amp0 > 0 ) )
540  gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Anq, &Amp0,
541  ALPHA, WA, &WAd[LLD_], Aptr0, &Ald, zero, WC, &WCd[LLD_] );
542  if( WAfr ) free( WA );
543  if( Afr ) free( Aptr );
544 
545  if( WCsum )
546  {
547  WCd[RSRC_] = PB_Cindxg2p( IC + ( fwd ? k : ktmp - 1 ), Cimb,
548  Cmb, Crsrc, Crsrc, nprow );
549  if( Anq > 0 )
550  gsum2d( ctxt, COLUMN, &top, kbb, Anq, WC, WCd[LLD_],
551  WCd[RSRC_], mycol );
552  }
553 /*
554 * Zero upper triangle of WC( 0:kbb-1, k:k+kbb-1 )
555 */
556  if( conjg )
557  PB_Cplapad( TYPE, UPPER, CONJG, kbb, kbb, zero, zero, WC,
558  0, k, WCd );
559  else if( kbb > 1 )
560  PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WC,
561  0, k+1, WCd );
562 /*
563 * Add WC to C( IC+k:IC+k+kbb-1, JC:JC+k+kbb-1 )
564 */
565  PB_CScatterV( TYPE, &ScatterDir, kbb, ktmp, WC, 0, 0, WCd, ROW, one,
566  C, IC+k, JC, DESCC, ROW );
567  if( WCfr ) free( WC );
568  }
569  }
570  }
571 /*
572 * End of PB_CpsyrkAC
573 */
574 }
CCONJG
#define CCONJG
Definition: PBblas.h:21
TYPE
#define TYPE
Definition: clamov.c:7
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
CSRC_
#define CSRC_
Definition: PBtools.h:46
PB_CScatterV
void PB_CScatterV()
CCOTRAN
#define CCOTRAN
Definition: PBblas.h:22
NOCONJG
#define NOCONJG
Definition: PBblas.h:45
PB_Cfirstnb
int PB_Cfirstnb()
DLEN_
#define DLEN_
Definition: PBtools.h:48
GEMM_T
F_VOID_FCT(* GEMM_T)()
Definition: pblas.h:313
NOTRAN
#define NOTRAN
Definition: PBblas.h:44
LLD_
#define LLD_
Definition: PBtools.h:47
PB_CpsyrkAC
void PB_CpsyrkAC(PBTYP_T *TYPE, char *DIRECA, char *CONJUG, char *UPLO, char *TRANS, int N, int K, char *ALPHA, char *A, int IA, int JA, int *DESCA, char *BETA, char *C, int IC, int JC, int *DESCC)
Definition: PB_CpsyrkAC.c:25
UPPER
#define UPPER
Definition: PBblas.h:52
IMB_
#define IMB_
Definition: PBtools.h:41
pilaenv_
int pilaenv_()
PB_Cplascal
void PB_Cplascal()
PB_CGatherV
void PB_CGatherV()
INIT
#define INIT
Definition: PBblas.h:61
PB_Cdescset
void PB_Cdescset()
PB_Cplapad
void PB_Cplapad()
PB_Cindxg2p
int PB_Cindxg2p()
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
GSUM2D_T
void(* GSUM2D_T)()
Definition: pblas.h:282
PB_Ctop
char * PB_Ctop()
RSRC_
#define RSRC_
Definition: PBtools.h:45
CNOTRAN
#define CNOTRAN
Definition: PBblas.h:18
COMBINE
#define COMBINE
Definition: PBblacs.h:49
CONJG
#define CONJG
Definition: PBblas.h:47
PB_Cinfog2l
void PB_Cinfog2l()
PB_Cnumroc
int PB_Cnumroc()
CFORWARD
#define CFORWARD
Definition: PBblas.h:38
PB_CInV
void PB_CInV()
MIN
#define MIN(a_, b_)
Definition: PBtools.h:76
INB_
#define INB_
Definition: PBtools.h:42
LOWER
#define LOWER
Definition: PBblas.h:51
PB_COutV
void PB_COutV()
REUSE
#define REUSE
Definition: PBblas.h:67
C2F_CHAR
#define C2F_CHAR(a)
Definition: pblas.h:121
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
Mupcase
#define Mupcase(C)
Definition: PBtools.h:83
CUPPER
#define CUPPER
Definition: PBblas.h:26
CTRAN
#define CTRAN
Definition: PBblas.h:20
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
CBACKWARD
#define CBACKWARD
Definition: PBblas.h:39
CTXT_
#define CTXT_
Definition: PBtools.h:38