SCALAPACK 2.2.2
LAPACK: Linear Algebra PACKage
Loading...
Searching...
No Matches
PB_Cpsyr2kAC.c
Go to the documentation of this file.
1/* ---------------------------------------------------------------------
2*
3* -- PBLAS auxiliary routine (version 2.0) --
4* University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5* and University of California, Berkeley.
6* April 1, 1998
7*
8* ---------------------------------------------------------------------
9*/
10/*
11* Include files
12*/
13#include "../pblas.h"
14#include "../PBpblas.h"
15#include "../PBtools.h"
16#include "../PBblacs.h"
17#include "../PBblas.h"
18
19#ifdef __STDC__
20void PB_Cpsyr2kAC( PBTYP_T * TYPE, char * DIRECAB, char * CONJUG,
21 char * UPLO, char * TRANS, Int N, Int K, char * ALPHA,
22 char * A, Int IA, Int JA, Int * DESCA, char * B,
23 Int IB, Int JB, Int * DESCB, char * BETA, char * C,
24 Int IC, Int JC, Int * DESCC )
25#else
26void PB_Cpsyr2kAC( TYPE, DIRECAB, CONJUG, UPLO, TRANS, N, K, ALPHA, A, IA,
27 JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC, DESCC )
28/*
29* .. Scalar Arguments ..
30*/
31 char * CONJUG, * DIRECAB, * TRANS, * UPLO;
32 Int IA, IB, IC, JA, JB, JC, K, N;
33 char * ALPHA, * BETA;
34 PBTYP_T * TYPE;
35/*
36* .. Array Arguments ..
37*/
38 Int * DESCA, * DESCB, * DESCC;
39 char * A, * B, * C;
40#endif
41{
42/*
43* Purpose
44* =======
45*
46* PB_Cpsyr2kAC performs one of the following symmetric or Hermitian
47* rank 2k operations
48*
49* sub( C ) := alpha*sub( A )*sub( B )' + alpha*sub( B )*sub( A )' +
50* beta*sub( C ),
51* or
52* sub( C ) := alpha*sub( A )*conjg( sub( B ) )' +
53* conjg( alpha )*sub( B )*conjg( sub( A ) )' +
54* beta*sub( C ),
55* or
56* sub( C ) := alpha*sub( A )'*sub( B ) + alpha*sub( B )'*sub( A ) +
57* beta*sub( C ),
58* or
59* sub( C ) := alpha*conjg( sub( A )' )*sub( B ) +
60* conjg( alpha )*conjg( sub( B )' )*sub( A ) +
61* beta*sub( C ),
62*
63* where
64*
65* sub( C ) denotes C(IC:IC+N-1,JC:JC+N-1),
66*
67* sub( A ) denotes A(IA:IA+N-1,JA:JA+K-1) if TRANS = 'N',
68* A(IA:IA+K-1,JA:JA+N-1) otherwise, and,
69*
70* sub( B ) denotes B(IB:IB+N-1,JB:JB+K-1) if TRANS = 'N',
71* B(IB:IB+K-1,JB:JB+N-1) otherwise.
72*
73* Alpha and beta are scalars, sub( C ) is an n by n symmetric or
74* Hermitian submatrix and sub( A ) and sub( B ) are n by k submatrices
75* in the first case and k by n submatrices in the second case.
76*
77* This is the outer-product algorithm using the logical aggregation
78* blocking technique.
79*
80* Notes
81* =====
82*
83* A description vector is associated with each 2D block-cyclicly dis-
84* tributed matrix. This vector stores the information required to
85* establish the mapping between a matrix entry and its corresponding
86* process and memory location.
87*
88* In the following comments, the character _ should be read as
89* "of the distributed matrix". Let A be a generic term for any 2D
90* block cyclicly distributed matrix. Its description vector is DESC_A:
91*
92* NOTATION STORED IN EXPLANATION
93* ---------------- --------------- ------------------------------------
94* DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
95* CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
96* the NPROW x NPCOL BLACS process grid
97* A is distributed over. The context
98* itself is global, but the handle
99* (the integer value) may vary.
100* M_A (global) DESCA[ M_ ] The number of rows in the distribu-
101* ted matrix A, M_A >= 0.
102* N_A (global) DESCA[ N_ ] The number of columns in the distri-
103* buted matrix A, N_A >= 0.
104* IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
105* block of the matrix A, IMB_A > 0.
106* INB_A (global) DESCA[ INB_ ] The number of columns of the upper
107* left block of the matrix A,
108* INB_A > 0.
109* MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
110* bute the last M_A-IMB_A rows of A,
111* MB_A > 0.
112* NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
113* bute the last N_A-INB_A columns of
114* A, NB_A > 0.
115* RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
116* row of the matrix A is distributed,
117* NPROW > RSRC_A >= 0.
118* CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
119* first column of A is distributed.
120* NPCOL > CSRC_A >= 0.
121* LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
122* array storing the local blocks of
123* the distributed matrix A,
124* IF( Lc( 1, N_A ) > 0 )
125* LLD_A >= MAX( 1, Lr( 1, M_A ) )
126* ELSE
127* LLD_A >= 1.
128*
129* Let K be the number of rows of a matrix A starting at the global in-
130* dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
131* that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
132* receive if these K rows were distributed over NPROW processes. If K
133* is the number of columns of a matrix A starting at the global index
134* JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
135* lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
136* these K columns were distributed over NPCOL processes.
137*
138* The values of Lr() and Lc() may be determined via a call to the func-
139* tion PB_Cnumroc:
140* Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
141* Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
142*
143* Arguments
144* =========
145*
146* TYPE (local input) pointer to a PBTYP_T structure
147* On entry, TYPE is a pointer to a structure of type PBTYP_T,
148* that contains type information (See pblas.h).
149*
150* DIRECAB (global input) pointer to CHAR
151* On entry, DIRECAB specifies the direction in which the rows
152* or columns of sub( A ), sub( B ) and sub( C ) should be
153* looped over as follows:
154* DIRECAB = 'F' or 'f' forward or increasing,
155* DIRECAB = 'B' or 'b' backward or decreasing.
156*
157* CONJUG (global input) pointer to CHAR
158* On entry, CONJUG specifies whether sub( C ) is a symmetric or
159* Hermitian submatrix operand as follows:
160* CONJUG = 'N' or 'n' sub( C ) is symmetric,
161* CONJUG = 'Z' or 'z' sub( C ) is Hermitian.
162*
163* UPLO (global input) pointer to CHAR
164* On entry, UPLO specifies whether the local pieces of
165* the array C containing the upper or lower triangular part
166* of the submatrix sub( C ) are to be referenced as follows:
167* UPLO = 'U' or 'u' Only the local pieces corresponding to
168* the upper triangular part of the
169* submatrix sub( C ) are referenced,
170* UPLO = 'L' or 'l' Only the local pieces corresponding to
171* the lower triangular part of the
172* submatrix sub( C ) are referenced.
173*
174* TRANS (global input) pointer to CHAR
175* On entry, TRANS specifies the operation to be performed as
176* follows:
177*
178* TRANS = 'N' or 'n'
179* sub( C ) := alpha*sub( A )*sub( B )' +
180* alpha*sub( B )*sub( A )' +
181* beta*sub( C ),
182* or
183* sub( C ) := alpha*sub( A )*sub( B )' +
184* alpha*sub( B )*sub( A )' +
185* beta*sub( C ),
186* or
187* sub( C ) := alpha*sub( A )*conjg( sub( B )' ) +
188* conjg( alpha )*sub( B )*conjg( sub( A )' ) +
189* beta*sub( C ),
190*
191* TRANS = 'T' or 't'
192* sub( C ) := alpha*sub( B )'*sub( A ) +
193* alpha*sub( A )'*sub( B ) +
194* beta*sub( C ),
195* or
196* sub( C ) := alpha*sub( B )'*sub( A ) +
197* alpha*sub( A )'*sub( B ) +
198* beta*sub( C ),
199*
200* TRANS = 'C' or 'c'
201* sub( C ) := alpha*sub( B )'*sub( A ) +
202* alpha*sub( A )'*sub( B ) +
203* beta*sub( C ),
204* or
205* sub( C ) := alpha*conjg( sub( A )' )*sub( B ) +
206* conjg( alpha )*conjg( sub( B )' )*sub( A ) +
207* beta*sub( C ).
208*
209* N (global input) INTEGER
210* On entry, N specifies the order of the submatrix sub( C ).
211* N must be at least zero.
212*
213* K (global input) INTEGER
214* On entry with TRANS = 'N' or 'n', K specifies the number of
215* columns of the submatrices sub( A ) and sub( B ), and on
216* entry with TRANS = 'T' or 't' or 'C' or 'c', K specifies the
217* number of rows of the submatrices sub( A ) and sub( B ).
218* K must be at least zero.
219*
220* ALPHA (global input) pointer to CHAR
221* On entry, ALPHA specifies the scalar alpha. When ALPHA is
222* supplied as zero then the local entries of the arrays A
223* and B corresponding to the entries of the submatrices
224* sub( A ) and sub( B ) respectively need not be set on input.
225*
226* A (local input) pointer to CHAR
227* On entry, A is an array of dimension (LLD_A, Ka), where Ka is
228* at least Lc( 1, JA+K-1 ) when TRANS = 'N' or 'n', and is at
229* least Lc( 1, JA+N-1 ) otherwise. Before entry, this array
230* contains the local entries of the matrix A.
231* Before entry with TRANS = 'N' or 'n', this array contains the
232* local entries corresponding to the entries of the n by k sub-
233* matrix sub( A ), otherwise the local entries corresponding to
234* the entries of the k by n submatrix sub( A ).
235*
236* IA (global input) INTEGER
237* On entry, IA specifies A's global row index, which points to
238* the beginning of the submatrix sub( A ).
239*
240* JA (global input) INTEGER
241* On entry, JA specifies A's global column index, which points
242* to the beginning of the submatrix sub( A ).
243*
244* DESCA (global and local input) INTEGER array
245* On entry, DESCA is an integer array of dimension DLEN_. This
246* is the array descriptor for the matrix A.
247*
248* B (local input) pointer to CHAR
249* On entry, B is an array of dimension (LLD_B, Kb), where Kb is
250* at least Lc( 1, JB+K-1 ) when TRANS = 'N' or 'n', and is at
251* least Lc( 1, JB+N-1 ) otherwise. Before entry, this array
252* contains the local entries of the matrix B.
253* Before entry with TRANS = 'N' or 'n', this array contains the
254* local entries corresponding to the entries of the n by k sub-
255* matrix sub( B ), otherwise the local entries corresponding to
256* the entries of the k by n submatrix sub( B ).
257*
258* IB (global input) INTEGER
259* On entry, IB specifies B's global row index, which points to
260* the beginning of the submatrix sub( B ).
261*
262* JB (global input) INTEGER
263* On entry, JB specifies B's global column index, which points
264* to the beginning of the submatrix sub( B ).
265*
266* DESCB (global and local input) INTEGER array
267* On entry, DESCB is an integer array of dimension DLEN_. This
268* is the array descriptor for the matrix B.
269*
270* BETA (global input) pointer to CHAR
271* On entry, BETA specifies the scalar beta. When BETA is
272* supplied as zero then the local entries of the array C
273* corresponding to the entries of the submatrix sub( C ) need
274* not be set on input.
275*
276* C (local input/local output) pointer to CHAR
277* On entry, C is an array of dimension (LLD_C, Kc), where Kc is
278* at least Lc( 1, JC+N-1 ). Before entry, this array contains
279* the local entries of the matrix C.
280* Before entry with UPLO = 'U' or 'u', this array contains
281* the local entries corresponding to the upper triangular part
282* of the symmetric or Hermitian submatrix sub( C ), and the
283* local entries corresponding to the strictly lower triangular
284* of sub( C ) are not referenced. On exit, the upper triangular
285* part of sub( C ) is overwritten by the upper triangular part
286* of the updated submatrix.
287* Before entry with UPLO = 'L' or 'l', this array contains
288* the local entries corresponding to the lower triangular part
289* of the symmetric or Hermitian submatrix sub( C ), and the
290* local entries corresponding to the strictly upper triangular
291* of sub( C ) are not referenced. On exit, the lower triangular
292* part of sub( C ) is overwritten by the lower triangular part
293* of the updated submatrix.
294* Note that the imaginary parts of the local entries corres-
295* ponding to the diagonal elements of sub( C ) need not be
296* set, they are assumed to be zero, and on exit they are set
297* to zero.
298*
299* IC (global input) INTEGER
300* On entry, IC specifies C's global row index, which points to
301* the beginning of the submatrix sub( C ).
302*
303* JC (global input) INTEGER
304* On entry, JC specifies C's global column index, which points
305* to the beginning of the submatrix sub( C ).
306*
307* DESCC (global and local input) INTEGER array
308* On entry, DESCC is an integer array of dimension DLEN_. This
309* is the array descriptor for the matrix C.
310*
311* -- Written on April 1, 1998 by
312* Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
313*
314* ---------------------------------------------------------------------
315*/
316/*
317* .. Local Scalars ..
318*/
319 char GatherDir, ScatterDir, * one, top, * talpha, * tbeta, tran,
320 * zero;
321 Int ABm, ABn, Acol, Acurcol, Acurrow, Acurimb1, Acurinb1, Afr,
322 Aii, Aimb, Aimb1, Ainb, Ainb1, AisD, AisR, Ajj, Ald, Amb,
323 Amp, Amp0, Anb, Anq, Anq0, Arow, Aspan, Bcol, Bcurcol,
324 Bcurrow, Bcurimb1, Bcurinb1, Bfr, Bii, Bimb, Bimb1, Binb,
325 Binb1, BisD, BisR, Bjj, Bld, Bmb, Bmp, Bmp0, Bnb, Bnq, Bnq0,
326 Brow, Bspan, Ccsrc, Cimb, Cinb, Cmb, Cnb, Crsrc, WAfr, WACfr,
327 WACld, WACreuse, WACsum, WBfr, WBCfr, WBCld, WBCsum, conjg,
328 ctxt, fwd, k, kb, kbb, kend, kstart, kstep, ktmp, mycol,
329 myrow, notran, npcol, nprow, size, upper;
330 GEMM_T gemm;
331 GSUM2D_T gsum2d;
332/*
333* .. Local Arrays ..
334*/
335 char * Aptr = NULL, * Aptr0 = NULL, * Bptr = NULL, * Bptr0 = NULL,
336 * WA = NULL, * WB = NULL, * WAC = NULL, *WBC = NULL;
337 Int Ad0[DLEN_], Bd0[DLEN_], DBUFA[DLEN_], DBUFB[DLEN_],
338 WAd[DLEN_], WBd[DLEN_], WACd [DLEN_], WBCd [DLEN_];
339/* ..
340* .. Executable Statements ..
341*
342*/
343/*
344* sub( C ) = beta * sub( C )
345*/
346 PB_Cplascal( TYPE, UPLO, CONJUG, N, N, BETA, C, IC, JC, DESCC );
347/*
348* Retrieve process grid information
349*/
350 Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
351
352 fwd = ( Mupcase( DIRECAB[0] ) == CFORWARD );
353 conjg = ( Mupcase( CONJUG [0] ) == CCONJG );
354 upper = ( Mupcase( UPLO [0] ) == CUPPER );
355 notran = ( Mupcase( TRANS [0] ) == CNOTRAN );
356
357 size = TYPE->size; one = TYPE->one; zero = TYPE->zero;
358 gsum2d = TYPE->Cgsum2d; gemm = TYPE->Fgemm;
359/*
360* Figure out the loop bounds accordingly to DIRECAB
361*/
362 kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
363 if( fwd )
364 {
365 kstart = 0; kend = ( ( N - 1 ) / kb + 1 ) * kb; kstep = kb;
366 GatherDir = CFORWARD; ScatterDir = CBACKWARD;
367 }
368 else
369 {
370 kstart = ( ( N - 1 ) / kb ) * kb; kend = kstep = -kb;
371 GatherDir = CBACKWARD; ScatterDir = CFORWARD;
372 }
373/*
374* Compute conjg( ALPHA ) and transpose parameter for Hermitian case
375*/
376 if( conjg )
377 {
378 tran = CCOTRAN; talpha = PB_Cmalloc( size );
379 PB_Cconjg( TYPE, ALPHA, talpha );
380 }
381 else { tran = CTRAN; talpha = ALPHA; }
382/*
383* Compute local information for sub( A ) and sub( B )
384*/
385 if( notran ) { ABm = N; ABn = K; } else { ABm = K; ABn = N; }
386 PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj,
387 &Arow, &Acol );
388 Aimb = DESCA[IMB_]; Ainb = DESCA[INB_];
389 Amb = DESCA[MB_ ]; Anb = DESCA[NB_ ]; Ald = DESCA[LLD_];
390 Aimb1 = PB_Cfirstnb( ABm, IA, Aimb, Amb );
391 Amp0 = PB_Cnumroc( ABm, 0, Aimb1, Amb, myrow, Arow, nprow );
392 Ainb1 = PB_Cfirstnb( ABn, JA, Ainb, Anb );
393 Anq0 = PB_Cnumroc( ABn, 0, Ainb1, Anb, mycol, Acol, npcol );
394 if( ( Amp0 > 0 ) && ( Anq0 > 0 ) ) Aptr0 = Mptr( A, Aii, Ajj, Ald, size );
395
396 PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj,
397 &Brow, &Bcol );
398 Bimb = DESCB[IMB_]; Binb = DESCB[INB_];
399 Bmb = DESCB[MB_ ]; Bnb = DESCB[NB_ ]; Bld = DESCB[LLD_];
400 Bimb1 = PB_Cfirstnb( ABm, IB, Bimb, Bmb );
401 Bmp0 = PB_Cnumroc( ABm, 0, Bimb1, Bmb, myrow, Brow, nprow );
402 Binb1 = PB_Cfirstnb( ABn, JB, Binb, Bnb );
403 Bnq0 = PB_Cnumroc( ABn, 0, Binb1, Bnb, mycol, Bcol, npcol );
404 if( ( Bmp0 > 0 ) && ( Bnq0 > 0 ) ) Bptr0 = Mptr( B, Bii, Bjj, Bld, size );
405
406 if( notran )
407 {
408 top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
409 Cinb = DESCC[INB_]; Cnb = DESCC[NB_]; Ccsrc = DESCC[CSRC_];
410/*
411* Determine if one can reuse the WAC buffer for the intermediate local products
412* sub( A ) * sub( B )' and sub( B ) * sub( A )'.
413*/
414 AisR = ( ( Acol < 0 ) || ( npcol == 1 ) );
415 BisR = ( ( Bcol < 0 ) || ( npcol == 1 ) );
416
417 if( !( AisR ) && !( BisR ) )
418 {
419/*
420* When neither sub( A ) nor sub( B ) are replicated, WAC can be reused if
421* either sub( A ) spans more than one process column, or, neither sub( A )
422* nor sub( B ) span more than one process column, and both operands reside
423* in the same process column.
424*/
425 Aspan = PB_Cspan( ABn, 0, Ainb1, Anb, Acol, npcol );
426 Bspan = PB_Cspan( ABn, 0, Binb1, Bnb, Bcol, npcol );
427 WACreuse = ( Aspan ||
428 ( !( Aspan ) && !( Bspan ) && ( Acol == Bcol ) ) );
429 }
430 else
431 {
432/*
433* Otherwise, WAC can be reused when both operands sub( A ) and sub( B ) are
434* replicated.
435*/
436 WACreuse = ( AisR && BisR );
437 }
438/*
439* Furthermore, the ability to reuse WAC requires sub( A ) and sub( B ) to be
440* either both not row-distributed, or, both row-distributed and aligned.
441*/
442 AisD = ( ( Arow >= 0 ) && ( nprow > 1 ) );
443 BisD = ( ( Brow >= 0 ) && ( nprow > 1 ) );
444 WACreuse = ( WACreuse &&
445 ( ( !AisD && !BisD ) || ( ( AisD && BisD ) &&
446 ( ( Arow == Brow ) &&
447 ( ( ( Aimb1 >= ABm ) && ( Bimb1 >= ABm ) ) ||
448 ( ( Aimb1 == Bimb1 ) && ( Amb == Bmb ) ) ) ) ) ) );
449 tbeta = ( WACreuse ? one : zero );
450
451 if( upper )
452 {
453 for( k = kstart; k != kend; k += kstep )
454 {
455 kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
456/*
457* Accumulate B( IB+k:IB+k+kbb-1, JB:JB+K-1 )
458*/
459 PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, ABn, B, IB+k, JB, DESCB,
460 ROW, &Bptr, DBUFB, &Bfr );
461/*
462* Replicate B( IB+k:IB+k+kbb-1, JB:JB+K-1 ) over A( IA:IA+k+kbb-1, JA:JA+K-1 )
463*/
464 PB_Cdescset( Ad0, ktmp, ABn, Aimb1, Ainb1, Amb, Anb, Arow, Acol,
465 ctxt, Ald );
466 PB_CInV( TYPE, NOCONJG, ROW, ktmp, ABn, Ad0, kbb, Bptr, 0, 0, DBUFB,
467 ROW, &WB, WBd, &WBfr );
468/*
469* WAC := A( IA:IA+k+kbb-1, JA:JA+K-1 ) * B( IB+k:IB+k+kbb-1, JB:JB+K-1 )'
470*/
471 PB_COutV( TYPE, COLUMN, INIT, ktmp, ABn, Ad0, kbb, &WAC, WACd,
472 &WACfr, &WACsum );
473 WACld = WACd[LLD_];
474 Amp = PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow );
475 if( ( Amp > 0 ) && ( Anq0 > 0 ) )
476 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Amp, &kbb, &Anq0,
477 ALPHA, Aptr0, &Ald, WB, &WBd[LLD_], zero, WAC, &WACld );
478 if( WBfr ) free( WB );
479 if( Bfr ) free( Bptr );
480/*
481* Accumulate A( IA+k:IA+k+kbb-1, JA:JA+K-1 )
482*/
483 PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, ABn, A, IA+k, JA, DESCA,
484 ROW, &Aptr, DBUFA, &Afr );
485/*
486* Replicate A( IA+k:IA+k+kbb-1, JA:JA+K-1 ) over B( IB:IB+k+kbb-1, JB:JB+K-1 )
487*/
488 PB_Cdescset( Bd0, ktmp, ABn, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
489 ctxt, Bld );
490 PB_CInV( TYPE, NOCONJG, ROW, ktmp, ABn, Bd0, kbb, Aptr, 0, 0, DBUFA,
491 ROW, &WA, WAd, &WAfr );
492/*
493* WBC := B( IB:IB+k+kbb-1, JB:JB+K-1 ) * A( IA+k:IA+k+kbb-1, JA:JA+K-1 )'
494*/
495 if( WACreuse )
496 { WBC = WAC; MDescCopy( WACd, WBCd ); WBCfr = 0; WBCsum = WACsum; }
497 else
498 {
499 PB_COutV( TYPE, COLUMN, INIT, ktmp, ABn, Bd0, kbb, &WBC, WBCd,
500 &WBCfr, &WBCsum );
501 }
502 WBCld = WBCd[LLD_];
503 Bmp = PB_Cnumroc( ktmp, 0, Bimb1, Bmb, myrow, Brow, nprow );
504 if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
505 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Bmp, &kbb, &Bnq0,
506 talpha, Bptr0, &Bld, WA, &WAd[LLD_], tbeta, WBC, &WBCld );
507 if( WAfr ) free( WA );
508 if( Afr ) free( Aptr );
509/*
510* Combine the local copies of WAC when necessary
511*/
512 if( WACsum )
513 {
514 WACd[CSRC_] = PB_Cindxg2p( JC + ( fwd ? k : ktmp - 1 ), Cinb,
515 Cnb, Ccsrc, Ccsrc, npcol );
516 if( Amp > 0 )
517 gsum2d( ctxt, ROW, &top, Amp, kbb, WAC, WACld, myrow,
518 WACd[CSRC_] );
519 }
520/*
521* Zero lower triangle of WAC( k:k+kbb-1, 0:kbb-1 )
522*/
523 if( conjg )
524 PB_Cplapad( TYPE, LOWER, CONJG, kbb, kbb, zero, zero, WAC,
525 k, 0, WACd );
526 else if( kbb > 1 )
527 PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, WAC,
528 k+1, 0, WACd );
529/*
530* Combine the local copies of WBC when necessary
531*/
532 if( !( WACreuse ) )
533 {
534 if( WBCsum )
535 {
536 if( WACsum ) { WBCd[CSRC_] = WACd[CSRC_]; }
537 else
538 {
539 WBCd[CSRC_] = PB_Cindxg2p( JC + ( fwd ? k : ktmp - 1 ),
540 Cinb, Cnb, Ccsrc, Ccsrc,
541 npcol );
542 }
543 if( Bmp > 0 )
544 gsum2d( ctxt, ROW, &top, Bmp, kbb, WBC, WBCld, myrow,
545 WBCd[CSRC_] );
546 }
547/*
548* Zero lower triangle of WBC( k:k+kbb-1, 0:kbb-1 )
549*/
550 if( conjg )
551 PB_Cplapad( TYPE, LOWER, CONJG, kbb, kbb, zero, zero,
552 WBC, k, 0, WBCd );
553 else if( kbb > 1 )
554 PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero,
555 WBC, k+1, 0, WBCd );
556 }
557/*
558* Add WAC to C( IC:IC+k+kbb-1, JC+k:JC+k+kbb-1 )
559*/
560 PB_CScatterV( TYPE, &ScatterDir, ktmp, kbb, WAC, 0, 0, WACd, COLUMN,
561 one, C, IC, JC+k, DESCC, COLUMN );
562 if( WACfr ) free( WAC );
563/*
564* Add WBC to C( IC:IC+k+kbb-1, JC+k:JC+k+kbb-1 )
565*/
566 if( !( WACreuse ) )
567 {
568 PB_CScatterV( TYPE, &ScatterDir, ktmp, kbb, WBC, 0, 0, WBCd,
569 COLUMN, one, C, IC, JC+k, DESCC, COLUMN );
570 if( WBCfr ) free( WBC );
571 }
572 }
573 }
574 else
575 {
576 for( k = kstart; k != kend; k += kstep )
577 {
578 ktmp = N - k; kbb = MIN( ktmp, kb );
579/*
580* Accumulate B( IB+k:IB+k+kbb-1, JB:JB+K-1 )
581*/
582 PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, ABn, B, IB+k, JB, DESCB,
583 ROW, &Bptr, DBUFB, &Bfr );
584/*
585* Replicate B( IB+k:IB+k+kbb-1, JB:JB+K-1 ) over A( IA+k:IA+N-1, JA:JA+K-1 )
586*/
587 Acurimb1 = PB_Cfirstnb( ktmp, IA+k, Aimb, Amb );
588 Acurrow = PB_Cindxg2p( k, Aimb1, Amb, Arow, Arow, nprow );
589 PB_Cdescset( Ad0, ktmp, ABn, Acurimb1, Ainb1, Amb, Anb, Acurrow,
590 Acol, ctxt, Ald );
591 PB_CInV( TYPE, NOCONJG, ROW, ktmp, ABn, Ad0, kbb, Bptr, 0, 0, DBUFB,
592 ROW, &WB, WBd, &WBfr );
593/*
594* WAC := A( IA+k:IA+N-1, JA:JA+K-1 ) * B( IB+k:IB+k+kbb-1, JB:JB+K-1 )'
595*/
596 PB_COutV( TYPE, COLUMN, INIT, ktmp, ABn, Ad0, kbb, &WAC, WACd,
597 &WACfr, &WACsum );
598 WACld = WACd[LLD_];
599 Amp = PB_Cnumroc( ktmp, k, Aimb1, Amb, myrow, Arow, nprow );
600 if( ( Amp > 0 ) && ( Anq0 > 0 ) )
601 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Amp, &kbb, &Anq0,
602 ALPHA, Mptr( Aptr0, Amp0-Amp, 0, Ald, size ), &Ald, WB,
603 &WBd[LLD_], zero, WAC, &WACld );
604 if( WBfr ) free( WB );
605 if( Bfr ) free( Bptr );
606/*
607* Accumulate A( IA+k:IA+k+kbb-1, JA:JA+K-1 )
608*/
609 PB_CGatherV( TYPE, REUSE, &GatherDir, kbb, ABn, A, IA+k, JA, DESCA,
610 ROW, &Aptr, DBUFA, &Afr );
611/*
612* Replicate A( IA+k:IA+k+kbb-1, JA:JA+K-1 ) over B( IB+k:IB+N-1, JB:JB+K-1 )
613*/
614 Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb );
615 Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow );
616 PB_Cdescset( Bd0, ktmp, ABn, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow,
617 Bcol, ctxt, Bld );
618 PB_CInV( TYPE, NOCONJG, ROW, ktmp, ABn, Bd0, kbb, Aptr, 0, 0, DBUFA,
619 ROW, &WA, WAd, &WAfr );
620/*
621* WBC := B( IB+k:IB+N-1, JB:JB+K-1 ) * A( IA+k:IA+k+kbb-1, JA:JA+K-1 )'
622*/
623 if( WACreuse )
624 { WBC = WAC; MDescCopy( WACd, WBCd ); WBCfr = 0; WBCsum = WACsum; }
625 else
626 {
627 PB_COutV( TYPE, COLUMN, INIT, ktmp, ABn, Bd0, kbb, &WBC, WBCd,
628 &WBCfr, &WBCsum );
629 }
630 WBCld = WBCd[LLD_];
631 Bmp = PB_Cnumroc( ktmp, k, Bimb1, Bmb, myrow, Brow, nprow );
632 if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
633 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( &tran ), &Bmp, &kbb, &Bnq0,
634 talpha, Mptr( Bptr0, Bmp0-Bmp, 0, Bld, size ), &Bld, WA,
635 &WAd[LLD_], tbeta, WBC, &WBCld );
636 if( WAfr ) free( WA );
637 if( Afr ) free( Aptr );
638/*
639* Combine the local copies of WAC when necessary
640*/
641 if( WACsum )
642 {
643 WACd[CSRC_] = PB_Cindxg2p( JC + ( fwd ? k : k + kbb - 1 ), Cinb,
644 Cnb, Ccsrc, Ccsrc, npcol );
645 if( Amp > 0 )
646 gsum2d( ctxt, ROW, &top, Amp, kbb, WAC, WACld, myrow,
647 WACd[CSRC_] );
648 }
649/*
650* Zero upper triangle of WAC( 0:kbb-1, 0:kbb-1 )
651*/
652 if( conjg )
653 PB_Cplapad( TYPE, UPPER, CONJG, kbb, kbb, zero, zero, WAC,
654 0, 0, WACd );
655 else if( kbb > 1 )
656 PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WAC,
657 0, 1, WACd );
658/*
659* Combine the local copies of WBC when necessary
660*/
661 if( !( WACreuse ) )
662 {
663 if( WBCsum )
664 {
665 if( WACsum ) { WBCd[CSRC_] = WACd[CSRC_]; }
666 else
667 {
668 WBCd[CSRC_] = PB_Cindxg2p( JC + ( fwd ? k : k+kbb-1 ),
669 Cinb, Cnb, Ccsrc, Ccsrc,
670 npcol );
671 }
672 if( Bmp > 0 )
673 gsum2d( ctxt, ROW, &top, Bmp, kbb, WBC, WBCld, myrow,
674 WBCd[CSRC_] );
675 }
676/*
677* Zero upper triangle of WBC( 0:kbb-1, 0:kbb-1 )
678*/
679 if( conjg )
680 PB_Cplapad( TYPE, UPPER, CONJG, kbb, kbb, zero, zero,
681 WBC, 0, 0, WBCd );
682 else if( kbb > 1 )
683 PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero,
684 WBC, 0, 1, WBCd );
685 }
686/*
687* Add WAC to C( IC+k:IC+N-1, JC+k:JC+k+kbb-1 )
688*/
689 PB_CScatterV( TYPE, &ScatterDir, ktmp, kbb, WAC, 0, 0, WACd, COLUMN,
690 one, C, IC+k, JC+k, DESCC, COLUMN );
691 if( WACfr ) free( WAC );
692/*
693* Add WBC to C( IC+k:IC+N-1, JC+k:JC+k+kbb-1 )
694*/
695 if( !( WACreuse ) )
696 {
697 PB_CScatterV( TYPE, &ScatterDir, ktmp, kbb, WBC, 0, 0, WBCd,
698 COLUMN, one, C, IC+k, JC+k, DESCC, COLUMN );
699 if( WBCfr ) free( WBC );
700 }
701 }
702 }
703 }
704 else
705 {
706 top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
707 Cimb = DESCC[IMB_]; Cmb = DESCC[MB_]; Crsrc = DESCC[RSRC_];
708/*
709* Determine if one can reuse the WAC buffer for the intermediate local products
710* sub( A )' * sub( B ) and sub( B )' * sub( A ).
711*/
712 AisR = ( ( Arow < 0 ) || ( nprow == 1 ) );
713 BisR = ( ( Brow < 0 ) || ( nprow == 1 ) );
714/*
715* When neither sub( A ) nor sub( B ) are replicated, WAC can be reused if
716* either sub( A ) spans more than one process row, or, neither sub( A ) nor
717* sub( B ) span more than one process row, and both operands reside in the
718* same process row.
719*/
720 if( !( AisR ) && !( BisR ) )
721 {
722 Aspan = PB_Cspan( ABm, 0, Aimb1, Amb, Arow, nprow );
723 Bspan = PB_Cspan( ABm, 0, Bimb1, Bmb, Brow, nprow );
724 WACreuse = ( Aspan ||
725 ( !( Aspan ) && !( Bspan ) && ( Arow == Brow ) ) );
726 }
727 else
728 {
729/*
730* Otherwise, WAC can be reused when both operands sub( A ) and sub( B ) are
731* replicated.
732*/
733 WACreuse = ( AisR && BisR );
734 }
735/*
736* Furthermore, the ability to reuse WAC requires sub( A ) and sub( B ) to be
737* either both not column-distributed, or, both column-distributed and aligned.
738*/
739 AisD = ( ( Acol >= 0 ) && ( npcol > 1 ) );
740 BisD = ( ( Bcol >= 0 ) && ( npcol > 1 ) );
741 WACreuse = ( WACreuse &&
742 ( ( !AisD && !BisD ) || ( ( AisD && BisD ) &&
743 ( ( Acol == Bcol ) &&
744 ( ( ( Ainb1 >= ABn ) && ( Binb1 >= ABn ) ) ||
745 ( ( Ainb1 == Binb1 ) && ( Anb == Bnb ) ) ) ) ) ) );
746 tbeta = ( WACreuse ? one : zero );
747
748 if( upper )
749 {
750 for( k = kstart; k != kend; k += kstep )
751 {
752 ktmp = N - k; kbb = MIN( ktmp, kb );
753/*
754* Accumulate B( IB:IB+K-1, JB+k:JB+k+kbb-1 )
755*/
756 PB_CGatherV( TYPE, REUSE, &GatherDir, ABm, kbb, B, IB, JB+k, DESCB,
757 COLUMN, &Bptr, DBUFB, &Bfr );
758/*
759* Replicate B( IB:IB+K-1, JB+k:JB+k+kbb-1 ) over A( IA:IA+K-1, JA+k:JA+N-1 )
760*/
761 Acurinb1 = PB_Cfirstnb( ktmp, JA+k, Ainb, Anb );
762 Acurcol = PB_Cindxg2p( k, Ainb1, Anb, Acol, Acol, npcol );
763 PB_Cdescset( Ad0, ABm, ktmp, Aimb1, Acurinb1, Amb, Anb, Arow,
764 Acurcol, ctxt, Ald );
765 PB_CInV( TYPE, NOCONJG, COLUMN, ABm, ktmp, Ad0, kbb, Bptr, 0, 0,
766 DBUFB, COLUMN, &WB, WBd, &WBfr );
767/*
768* WAC := B( IB:IB+K-1, JB+k:JB+k+kbb-1 )' * A( IA:IA+K-1, JA+k:JA+N-1 )
769*/
770 PB_COutV( TYPE, ROW, INIT, ABm, ktmp, Ad0, kbb, &WAC, WACd, &WACfr,
771 &WACsum );
772 WACld = WACd[LLD_];
773 Anq = PB_Cnumroc( ktmp, k, Ainb1, Anb, mycol, Acol, npcol );
774 if( ( Anq > 0 ) && ( Amp0 > 0 ) )
775 gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Anq, &Amp0,
776 talpha, WB, &WBd[LLD_], Mptr( Aptr0, 0, Anq0-Anq, Ald,
777 size ), &Ald, zero, WAC, &WACld );
778 if( WBfr ) free( WB );
779 if( Bfr ) free( Bptr );
780/*
781* Accumulate A( IA:IA+K-1, JA+k:JA+k+kbb-1 )
782*/
783 PB_CGatherV( TYPE, REUSE, &GatherDir, ABm, kbb, A, IA, JA+k, DESCA,
784 COLUMN, &Aptr, DBUFA, &Afr );
785/*
786* Replicate A( IA:IA+K-1, JA+k:JA+k+kbb-1 ) over B( IB:IB+K-1, JB+k:JB+N-1 )
787*/
788 Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb );
789 Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol );
790 PB_Cdescset( Bd0, ABm, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
791 Bcurcol, ctxt, Bld );
792 PB_CInV( TYPE, NOCONJG, COLUMN, ABm, ktmp, Bd0, kbb, Aptr, 0, 0,
793 DBUFA, COLUMN, &WA, WAd, &WAfr );
794/*
795* WBC := A( IA:IA+K-1, JA+k:JA+k+kbb-1 )' * B( IB:IB+K-1, JB+k:JB+N-1 )
796*/
797 if( WACreuse )
798 { WBC = WAC; MDescCopy( WACd, WBCd ); WBCfr = 0; WBCsum = WACsum; }
799 else
800 {
801 PB_COutV( TYPE, ROW, INIT, ABm, ktmp, Bd0, kbb, &WBC, WBCd,
802 &WBCfr, &WBCsum );
803 }
804 WBCld = WBCd[LLD_];
805 Bnq = PB_Cnumroc( ktmp, k, Binb1, Bnb, mycol, Bcol, npcol );
806 if( ( Bnq > 0 ) && ( Bmp0 > 0 ) )
807 gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Bnq, &Bmp0,
808 ALPHA, WA, &WAd[LLD_], Mptr( Bptr0, 0, Bnq0-Bnq, Bld,
809 size ), &Bld, tbeta, WBC, &WBCld );
810 if( WAfr ) free( WA );
811 if( Afr ) free( Aptr );
812/*
813* Combine the local copies of WAC when necessary
814*/
815 if( WACsum )
816 {
817 WACd[RSRC_] = PB_Cindxg2p( IC + ( fwd ? k : k + kbb - 1 ), Cimb,
818 Cmb, Crsrc, Crsrc, nprow );
819 if( Anq > 0 )
820 gsum2d( ctxt, COLUMN, &top, kbb, Anq, WAC, WACld, WACd[RSRC_],
821 mycol );
822 }
823/*
824* Zero lower triangle of WBC( 0:kbb-1, 0:kbb-1 )
825*/
826 if( conjg )
827 PB_Cplapad( TYPE, LOWER, CONJG, kbb, kbb, zero, zero, WAC,
828 0, 0, WACd );
829 else if( kbb > 1 )
830 PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero, WAC,
831 1, 0, WACd );
832/*
833* Combine the local copies of WBC when necessary
834*/
835 if( !( WACreuse ) )
836 {
837 if( WBCsum )
838 {
839 if( WACsum ) { WBCd[RSRC_] = WACd[RSRC_]; }
840 else
841 {
842 WBCd[RSRC_] = PB_Cindxg2p( IC + ( fwd ? k : k + kbb - 1 ),
843 Cimb, Cmb, Crsrc, Crsrc,
844 nprow );
845 }
846 if( Bnq > 0 )
847 gsum2d( ctxt, COLUMN, &top, kbb, Bnq, WBC, WBCld,
848 WBCd[RSRC_], mycol );
849 }
850/*
851* Zero lower triangle of WBC( 0:kbb-1, 0:kbb-1 )
852*/
853 if( conjg )
854 PB_Cplapad( TYPE, LOWER, CONJG, kbb, kbb, zero, zero,
855 WBC, 0, 0, WBCd );
856 else if( kbb > 1 )
857 PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero,
858 WBC, 1, 0, WBCd );
859 }
860/*
861* Add WAC to C( IC+k:IC+k+kbb-1, JC+k:JC+N-1 )
862*/
863 PB_CScatterV( TYPE, &ScatterDir, kbb, ktmp, WAC, 0, 0, WACd, ROW,
864 one, C, IC+k, JC+k, DESCC, ROW );
865 if( WACfr ) free( WAC );
866/*
867* Add WBC to C( IC+k:IC+k+kbb-1, JC+k:JC+N-1 )
868*/
869 if( !( WACreuse ) )
870 {
871 PB_CScatterV( TYPE, &ScatterDir, kbb, ktmp, WBC, 0, 0, WBCd, ROW,
872 one, C, IC+k, JC+k, DESCC, ROW );
873 if( WBCfr ) free( WBC );
874 }
875 }
876 }
877 else
878 {
879 for( k = kstart; k != kend; k += kstep )
880 {
881 kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
882/*
883* Accumulate B( IB:IB+K-1, JB+k:JB+k+kbb-1 )
884*/
885 PB_CGatherV( TYPE, REUSE, &GatherDir, ABm, kbb, B, IB, JB+k, DESCB,
886 COLUMN, &Bptr, DBUFB, &Bfr );
887/*
888* Replicate B( IB:IB+K-1, JB+k:JB+k+kbb-1 ) over A( IA:IA+K-1, JA:JA+k+kbb-1 )
889*/
890 PB_Cdescset( Ad0, ABm, ktmp, Aimb1, Ainb1, Amb, Anb, Arow, Acol,
891 ctxt, Ald );
892 PB_CInV( TYPE, NOCONJG, COLUMN, ABm, ktmp, Ad0, kbb, Bptr, 0, 0,
893 DBUFB, COLUMN, &WB, WBd, &WBfr );
894/*
895* WAC := B( IB:IB+K-1, JB+k:JB+k+kbb-1 )' * A( IA:IA+K-1, JA:JA+k+kbb-1 )
896*/
897 PB_COutV( TYPE, ROW, INIT, ABm, ktmp, Ad0, kbb, &WAC, WACd, &WACfr,
898 &WACsum );
899 WACld = WACd[LLD_];
900 Anq = PB_Cnumroc( ktmp, 0, Ainb1, Anb, mycol, Acol, npcol );
901 if( ( Anq > 0 ) && ( Amp0 > 0 ) )
902 gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Anq, &Amp0,
903 talpha, WB, &WBd[LLD_], Aptr0, &Ald, zero, WAC, &WACld );
904 if( WBfr ) free( WB );
905 if( Bfr ) free( Bptr );
906/*
907* Accumulate A( IA:IA+K-1, JA+k:JA+k+kbb-1 )
908*/
909 PB_CGatherV( TYPE, REUSE, &GatherDir, ABm, kbb, A, IA, JA+k, DESCA,
910 COLUMN, &Aptr, DBUFA, &Afr );
911/*
912* Replicate A( IA:IA+K-1, JA+k:JA+k+kbb-1 ) over B( IB:IB+K-1, JB:JB+k+kbb-1 )
913*/
914 PB_Cdescset( Bd0, ABm, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
915 ctxt, Bld );
916 PB_CInV( TYPE, NOCONJG, COLUMN, ABm, ktmp, Bd0, kbb, Aptr, 0, 0,
917 DBUFA, COLUMN, &WA, WAd, &WAfr );
918/*
919* WBC := A( IA:IA+K-1, JA+k:JA+k+kbb-1 )' * B( IB:IB+K-1, JB:JB+k+kbb-1 )
920*/
921 if( WACreuse )
922 { WBC = WAC; MDescCopy( WACd, WBCd ); WBCfr = 0; WBCsum = WACsum; }
923 else
924 {
925 PB_COutV( TYPE, ROW, INIT, ABm, ktmp, Bd0, kbb, &WBC, WBCd,
926 &WBCfr, &WBCsum );
927 }
928 WBCld = WBCd[LLD_];
929 Bnq = PB_Cnumroc( ktmp, 0, Binb1, Bnb, mycol, Bcol, npcol );
930 if( ( Bnq > 0 ) && ( Bmp0 > 0 ) )
931 gemm( C2F_CHAR( &tran ), C2F_CHAR( NOTRAN ), &kbb, &Bnq, &Bmp0,
932 ALPHA, WA, &WAd[LLD_], Bptr0, &Bld, tbeta, WBC, &WBCld );
933 if( WAfr ) free( WA );
934 if( Afr ) free( Aptr );
935/*
936* Combine the local copies of WAC when necessary
937*/
938 if( WACsum )
939 {
940 WACd[RSRC_] = PB_Cindxg2p( IC + ( fwd ? k : ktmp - 1 ), Cimb,
941 Cmb, Crsrc, Crsrc, nprow );
942 if( Anq > 0 )
943 gsum2d( ctxt, COLUMN, &top, kbb, Anq, WAC, WACld, WACd[RSRC_],
944 mycol );
945 }
946/*
947* Zero upper triangle of WBC( 0:kbb-1, k:k+kbb-1 )
948*/
949 if( conjg )
950 PB_Cplapad( TYPE, UPPER, CONJG, kbb, kbb, zero, zero, WAC,
951 0, k, WACd );
952 else if( kbb > 1 )
953 PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero, WAC,
954 0, k+1, WACd );
955/*
956* Combine the local copies of WBC when necessary
957*/
958 if( !( WACreuse ) )
959 {
960 if( WBCsum )
961 {
962 if( WACsum ) { WBCd[RSRC_] = WACd[RSRC_]; }
963 else
964 {
965 WBCd[RSRC_] = PB_Cindxg2p( IC + ( fwd ? k : ktmp - 1 ),
966 Cimb, Cmb, Crsrc, Crsrc,
967 nprow );
968 }
969 if( Bnq > 0 )
970 gsum2d( ctxt, COLUMN, &top, kbb, Bnq, WBC, WBCld,
971 WBCd[RSRC_], mycol );
972 }
973/*
974* Zero upper triangle of WBC( 0:kbb-1, k:k+kbb-1 )
975*/
976 if( conjg )
977 PB_Cplapad( TYPE, UPPER, CONJG, kbb, kbb, zero, zero,
978 WBC, 0, k, WBCd );
979 else if( kbb > 1 )
980 PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero,
981 WBC, 0, k+1, WBCd );
982 }
983/*
984* Add WAC to C( IC+k:IC+k+kbb-1, JC:JC+k+kbb-1 )
985*/
986 PB_CScatterV( TYPE, &ScatterDir, kbb, ktmp, WAC, 0, 0, WACd, ROW,
987 one, C, IC+k, JC, DESCC, ROW );
988 if( WACfr ) free( WAC );
989/*
990* Add WBC to C( IC+k:IC+k+kbb-1, JC:JC+k+kbb-1 )
991*/
992 if( !( WACreuse ) )
993 {
994 PB_CScatterV( TYPE, &ScatterDir, kbb, ktmp, WBC, 0, 0, WBCd, ROW,
995 one, C, IC+k, JC, DESCC, ROW );
996 if( WBCfr ) free( WBC );
997 }
998 }
999 }
1000 }
1001 if( conjg ) free( talpha );
1002/*
1003* End of PB_Cpsyr2kAC
1004*/
1005}
#define Int
Definition Bconfig.h:22
F_VOID_FCT(* GEMM_T)()
Definition pblas.h:317
void(* GSUM2D_T)()
Definition pblas.h:286
#define C2F_CHAR(a)
Definition pblas.h:125
#define TOP_GET
Definition PBblacs.h:50
#define COLUMN
Definition PBblacs.h:45
#define COMBINE
Definition PBblacs.h:49
#define ROW
Definition PBblacs.h:46
void Cblacs_gridinfo()
#define NOTRAN
Definition PBblas.h:44
#define CONJG
Definition PBblas.h:47
#define CCONJG
Definition PBblas.h:21
#define CBACKWARD
Definition PBblas.h:39
#define NOCONJG
Definition PBblas.h:45
#define CUPPER
Definition PBblas.h:26
#define CNOTRAN
Definition PBblas.h:18
#define CTRAN
Definition PBblas.h:20
#define LOWER
Definition PBblas.h:51
#define REUSE
Definition PBblas.h:67
#define CCOTRAN
Definition PBblas.h:22
#define INIT
Definition PBblas.h:61
#define UPPER
Definition PBblas.h:52
#define CFORWARD
Definition PBblas.h:38
#define pilaenv_
Definition PBpblas.h:44
#define CTXT_
Definition PBtools.h:38
Int PB_Cfirstnb()
#define MB_
Definition PBtools.h:43
char * PB_Cmalloc()
void PB_Cinfog2l()
#define MDescCopy(DescIn, DescOut)
Definition PBtools.h:514
#define MIN(a_, b_)
Definition PBtools.h:76
#define Mptr(a_, i_, j_, lda_, siz_)
Definition PBtools.h:132
#define LLD_
Definition PBtools.h:47
void PB_CGatherV()
Int PB_Cnumroc()
char * PB_Ctop()
void PB_CInV()
void PB_CScatterV()
void PB_Cplapad()
void PB_Cplascal()
#define RSRC_
Definition PBtools.h:45
void PB_Cdescset()
void PB_COutV()
#define INB_
Definition PBtools.h:42
#define CSRC_
Definition PBtools.h:46
#define IMB_
Definition PBtools.h:41
Int PB_Cindxg2p()
#define Mupcase(C)
Definition PBtools.h:83
#define DLEN_
Definition PBtools.h:48
#define NB_
Definition PBtools.h:44
void PB_Cconjg()
Int PB_Cspan()
void PB_Cpsyr2kAC()
#define TYPE
Definition clamov.c:7