SCALAPACK 2.2.2
LAPACK: Linear Algebra PACKage
Loading...
Searching...
No Matches
PB_CpgemmAC.c
Go to the documentation of this file.
1/* ---------------------------------------------------------------------
2*
3* -- PBLAS auxiliary routine (version 2.0) --
4* University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5* and University of California, Berkeley.
6* April 1, 1998
7*
8* ---------------------------------------------------------------------
9*/
10/*
11* Include files
12*/
13#include "../pblas.h"
14#include "../PBpblas.h"
15#include "../PBtools.h"
16#include "../PBblacs.h"
17#include "../PBblas.h"
18
19#ifdef __STDC__
20void PB_CpgemmAC( PBTYP_T * TYPE, char * DIRECA, char * DIRECC,
21 char * TRANSA, char * TRANSB, Int M, Int N, Int K,
22 char * ALPHA, char * A, Int IA, Int JA, Int * DESCA,
23 char * B, Int IB, Int JB, Int * DESCB, char * BETA,
24 char * C, Int IC, Int JC, Int * DESCC )
25#else
26void PB_CpgemmAC( TYPE, DIRECA, DIRECC, TRANSA, TRANSB, M, N, K, ALPHA,
27 A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC,
28 DESCC )
29/*
30* .. Scalar Arguments ..
31*/
32 char * DIRECA, * DIRECC, * TRANSA, * TRANSB;
33 Int IA, IB, IC, JA, JB, JC, K, M, N;
34 char * ALPHA, * BETA;
35 PBTYP_T * TYPE;
36/*
37* .. Array Arguments ..
38*/
39 Int * DESCA, * DESCB, * DESCC;
40 char * A, * B, * C;
41#endif
42{
43/*
44* Purpose
45* =======
46*
47* PB_CpgemmAC performs one of the matrix-matrix operations
48*
49* sub( C ) := alpha*op( sub( A ) )*op( sub( B ) ) + beta*sub( C ),
50*
51* where
52*
53* sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of
54* op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ).
55*
56* Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+K-1) if TRANSA = 'N',
57* A(IA:IA+K-1,JA:JA+M-1)' if TRANSA = 'T',
58* conjg(A(IA:IA+K-1,JA:JA+M-1)') if TRANSA = 'C',
59*
60* and, op( sub( B ) ) denotes B(IB:IB+K-1,JB:JB+N-1) if TRANSB = 'N',
61* B(IB:IB+N-1,JB:JB+K-1)' if TRANSB = 'T',
62* conjg(B(IB:IB+N-1,JB:JB+K-1)') if TRANSB = 'C'.
63*
64* Alpha and beta are scalars. A, B and C are matrices; op( sub( A ) )
65* is an m by k submatrix, op( sub( B ) ) is an k by n submatrix and
66* sub( C ) is an m by n submatrix.
67*
68* This is the inner-product algorithm using the logical LCM algorithmic
69* blocking technique. The submatrix operand sub( B ) stays in place.
70*
71* Notes
72* =====
73*
74* A description vector is associated with each 2D block-cyclicly dis-
75* tributed matrix. This vector stores the information required to
76* establish the mapping between a matrix entry and its corresponding
77* process and memory location.
78*
79* In the following comments, the character _ should be read as
80* "of the distributed matrix". Let A be a generic term for any 2D
81* block cyclicly distributed matrix. Its description vector is DESC_A:
82*
83* NOTATION STORED IN EXPLANATION
84* ---------------- --------------- ------------------------------------
85* DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
86* CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
87* the NPROW x NPCOL BLACS process grid
88* A is distributed over. The context
89* itself is global, but the handle
90* (the integer value) may vary.
91* M_A (global) DESCA[ M_ ] The number of rows in the distribu-
92* ted matrix A, M_A >= 0.
93* N_A (global) DESCA[ N_ ] The number of columns in the distri-
94* buted matrix A, N_A >= 0.
95* IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
96* block of the matrix A, IMB_A > 0.
97* INB_A (global) DESCA[ INB_ ] The number of columns of the upper
98* left block of the matrix A,
99* INB_A > 0.
100* MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
101* bute the last M_A-IMB_A rows of A,
102* MB_A > 0.
103* NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
104* bute the last N_A-INB_A columns of
105* A, NB_A > 0.
106* RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
107* row of the matrix A is distributed,
108* NPROW > RSRC_A >= 0.
109* CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
110* first column of A is distributed.
111* NPCOL > CSRC_A >= 0.
112* LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
113* array storing the local blocks of
114* the distributed matrix A,
115* IF( Lc( 1, N_A ) > 0 )
116* LLD_A >= MAX( 1, Lr( 1, M_A ) )
117* ELSE
118* LLD_A >= 1.
119*
120* Let K be the number of rows of a matrix A starting at the global in-
121* dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
122* that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
123* receive if these K rows were distributed over NPROW processes. If K
124* is the number of columns of a matrix A starting at the global index
125* JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
126* lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
127* these K columns were distributed over NPCOL processes.
128*
129* The values of Lr() and Lc() may be determined via a call to the func-
130* tion PB_Cnumroc:
131* Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
132* Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
133*
134* Arguments
135* =========
136*
137* TYPE (local input) pointer to a PBTYP_T structure
138* On entry, TYPE is a pointer to a structure of type PBTYP_T,
139* that contains type information (See pblas.h).
140*
141* DIRECA (global input) pointer to CHAR
142* On entry, DIRECA specifies the direction in which the rows
143* or columns of sub( A ) should be looped over as follows:
144* DIRECA = 'F' or 'f' forward or increasing,
145* DIRECA = 'B' or 'b' backward or decreasing.
146*
147* DIRECC (global input) pointer to CHAR
148* On entry, DIRECC specifies the direction in which the rows
149* or columns of sub( C ) should be looped over as follows:
150* DIRECC = 'F' or 'f' forward or increasing,
151* DIRECC = 'B' or 'b' backward or decreasing.
152*
153* TRANSA (global input) pointer to CHAR
154* On entry, TRANSA specifies the form of op( sub( A ) ) to be
155* used in the matrix multiplication as follows:
156*
157* TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ),
158* TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )',
159* TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ).
160*
161* TRANSB (global input) pointer to CHAR
162* On entry, TRANSB specifies the form of op( sub( B ) ) to be
163* used in the matrix multiplication as follows:
164*
165* TRANSB = 'N' or 'n' op( sub( B ) ) = sub( B ),
166* TRANSB = 'T' or 't' op( sub( B ) ) = sub( B )',
167* TRANSB = 'C' or 'c' op( sub( B ) ) = conjg( sub( B )' ).
168*
169* M (global input) INTEGER
170* On entry, M specifies the number of rows of the submatrix
171* op( sub( A ) ) and of the submatrix sub( C ). M must be at
172* least zero.
173*
174* N (global input) INTEGER
175* On entry, N specifies the number of columns of the submatrix
176* op( sub( B ) ) and the number of columns of the submatrix
177* sub( C ). N must be at least zero.
178*
179* K (global input) INTEGER
180* On entry, K specifies the number of columns of the submatrix
181* op( sub( A ) ) and the number of rows of the submatrix
182* op( sub( B ) ). K must be at least zero.
183*
184* ALPHA (global input) pointer to CHAR
185* On entry, ALPHA specifies the scalar alpha. When ALPHA is
186* supplied as zero then the local entries of the arrays A and
187* B corresponding to the entries of the submatrices sub( A )
188* and sub( B ) respectively need not be set on input.
189*
190* A (local input) pointer to CHAR
191* On entry, A is an array of dimension (LLD_A, Ka), where Ka is
192* at least Lc( 1, JA+K-1 ) when TRANSA = 'N' or 'n', and is at
193* least Lc( 1, JA+M-1 ) otherwise. Before entry, this array
194* contains the local entries of the matrix A.
195*
196* IA (global input) INTEGER
197* On entry, IA specifies A's global row index, which points to
198* the beginning of the submatrix sub( A ).
199*
200* JA (global input) INTEGER
201* On entry, JA specifies A's global column index, which points
202* to the beginning of the submatrix sub( A ).
203*
204* DESCA (global and local input) INTEGER array
205* On entry, DESCA is an integer array of dimension DLEN_. This
206* is the array descriptor for the matrix A.
207*
208* B (local input) pointer to CHAR
209* On entry, B is an array of dimension (LLD_B, Kb), where Kb is
210* at least Lc( 1, JB+N-1 ) when TRANSB = 'N' or 'n', and is at
211* least Lc( 1, JB+K-1 ) otherwise. Before entry, this array
212* contains the local entries of the matrix B.
213*
214* IB (global input) INTEGER
215* On entry, IB specifies B's global row index, which points to
216* the beginning of the submatrix sub( B ).
217*
218* JB (global input) INTEGER
219* On entry, JB specifies B's global column index, which points
220* to the beginning of the submatrix sub( B ).
221*
222* DESCB (global and local input) INTEGER array
223* On entry, DESCB is an integer array of dimension DLEN_. This
224* is the array descriptor for the matrix B.
225*
226* BETA (global input) pointer to CHAR
227* On entry, BETA specifies the scalar beta. When BETA is
228* supplied as zero then the local entries of the array C
229* corresponding to the entries of the submatrix sub( C ) need
230* not be set on input.
231*
232* C (local input/local output) pointer to CHAR
233* On entry, C is an array of dimension (LLD_C, Kc), where Kc is
234* at least Lc( 1, JC+N-1 ). Before entry, this array contains
235* the local entries of the matrix C.
236* On exit, the entries of this array corresponding to the local
237* entries of the submatrix sub( C ) are overwritten by the
238* local entries of the m by n updated submatrix.
239*
240* IC (global input) INTEGER
241* On entry, IC specifies C's global row index, which points to
242* the beginning of the submatrix sub( C ).
243*
244* JC (global input) INTEGER
245* On entry, JC specifies C's global column index, which points
246* to the beginning of the submatrix sub( C ).
247*
248* DESCC (global and local input) INTEGER array
249* On entry, DESCC is an integer array of dimension DLEN_. This
250* is the array descriptor for the matrix C.
251*
252* -- Written on April 1, 1998 by
253* Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
254*
255* ---------------------------------------------------------------------
256*/
257/*
258* .. Local Scalars ..
259*/
260 char Aroc, GemmTa, GemmTb, TrA, TrB, * one, * talpha, * tbeta,
261 top, * zero;
262 Int Abufld, AcurrocR, Afr, Afwd, AiD, AiR, AiiD, AiiR, AinbD,
263 AinbR, Ainb1D, Ainb1R, AisR, Akk, Ald, AmyprocD, AmyprocR,
264 AnbD, AnbR, AnpD, AnpR, AnprocsD, AnprocsR, Aoff, ArocD,
265 ArocR, AsrcR, Asrc_, Bcol, Bii, Bimb1, Binb1, Bjj, Bld, Bm,
266 Bmb, Bmp, Bn, Bnb, Bnq, Brow, Cbufld, Ccol, Ccurrow, Cfr,
267 Cfwd, Cii, Cimb, Cimb1, Cinb, Cinb1, CisR, Cjj, Ckk, Cld,
268 Cmb, Cmp, Cnb, Cnq, Coff, Crow, Csrc, WAfr, WCfr, WCsum,
269 ctxt, lcmb, m, maxp, maxpm1, maxq, mb, mbb, mycol, myrow,
270 ncpq, nota, notb, npcol, npq=0, nprow, nrpq, p=0, q=0, size,
271 tmp;
272 GEMM_T gemm;
273 GSUM2D_T gsum2d;
274/*
275* .. Local Arrays ..
276*/
277 PB_VM_T VM;
278 Int Bd0[DLEN_], DBUFA[DLEN_], DBUFC[DLEN_], WAd[DLEN_],
279 WCd[DLEN_];
280 char * Abuf = NULL, * Bptr = NULL, * Cbuf = NULL, * WA = NULL,
281 * WC = NULL;
282/* ..
283* .. Executable Statements ..
284*
285*/
286 Cblacs_gridinfo( ( ctxt = DESCC[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
287
288 Afwd = ( Mupcase( DIRECA[0] ) == CFORWARD );
289 Cfwd = ( Mupcase( DIRECC[0] ) == CFORWARD );
290 nota = ( ( TrA = Mupcase( TRANSA[0] ) ) == CNOTRAN );
291 notb = ( ( TrB = Mupcase( TRANSB[0] ) ) == CNOTRAN );
292
293 size = TYPE->size; one = TYPE->one; zero = TYPE->zero;
294 gemm = TYPE->Fgemm; gsum2d = TYPE->Cgsum2d;
295 mb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
296/*
297* Compute local information for sub( A ), sub( B ) and sub( C )
298*/
299 if( nota )
300 {
301 AiD = JA; AiR = IA;
302 Asrc_ = RSRC_; Aroc = CROW;
303 AinbR = DESCA[IMB_ ]; AinbD = DESCA[INB_];
304 AnbR = DESCA[MB_ ]; AnbD = DESCA[NB_ ];
305 AsrcR = DESCA[Asrc_]; Ald = DESCA[LLD_];
306 AmyprocD = mycol; AnprocsD = npcol;
307 AmyprocR = myrow; AnprocsR = nprow;
308 PB_Cinfog2l( IA, JA, DESCA, AnprocsR, AnprocsD, AmyprocR, AmyprocD,
309 &AiiR, &AiiD, &ArocR, &ArocD );
310 }
311 else
312 {
313 AiD = IA; AiR = JA;
314 Asrc_ = CSRC_; Aroc = CCOLUMN;
315 AinbD = DESCA[IMB_ ]; AinbR = DESCA[INB_];
316 AnbD = DESCA[MB_ ]; AnbR = DESCA[NB_ ];
317 AsrcR = DESCA[Asrc_]; Ald = DESCA[LLD_];
318 AmyprocD = myrow; AnprocsD = nprow;
319 AmyprocR = mycol; AnprocsR = npcol;
320 PB_Cinfog2l( IA, JA, DESCA, AnprocsD, AnprocsR, AmyprocD, AmyprocR,
321 &AiiD, &AiiR, &ArocD, &ArocR );
322 }
323 Ainb1D = PB_Cfirstnb( K, AiD, AinbD, AnbD );
324 AnpD = PB_Cnumroc( K, 0, Ainb1D, AnbD, AmyprocD, ArocD, AnprocsD );
325 Ainb1R = PB_Cfirstnb( M, AiR, AinbR, AnbR );
326
327 Cimb = DESCC[IMB_ ]; Cinb = DESCC[INB_];
328 Cmb = DESCC[MB_ ]; Cnb = DESCC[NB_ ];
329 Csrc = DESCC[RSRC_]; Cld = DESCC[LLD_];
330 PB_Cinfog2l( IC, JC, DESCC, nprow, npcol, myrow, mycol, &Cii, &Cjj,
331 &Crow, &Ccol );
332 Cimb1 = PB_Cfirstnb( M, IC, Cimb, Cmb );
333 Cinb1 = PB_Cfirstnb( N, JC, Cinb, Cnb );
334 Cnq = PB_Cnumroc( N, 0, Cinb1, Cnb, mycol, Ccol, npcol );
335/*
336* Retrieve the BLACS combine topology, compute conjugate of alpha for the
337* conjugate transpose case and set the transpose parameters to be passed to
338* the BLAS matrix multiply routine.
339*/
340 if( notb )
341 {
342 Bm = K; Bn = N;
343 top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
344 talpha = ALPHA; GemmTa = ( nota ? CTRAN : TrA ); GemmTb = CNOTRAN;
345 }
346 else
347 {
348 Bm = N; Bn = K;
349 top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
350 if( TrB == CCOTRAN )
351 {
352 talpha = PB_Cmalloc( size ); PB_Cconjg( TYPE, ALPHA, talpha );
353 GemmTb = ( ( TrA == CCOTRAN ) ? CTRAN : CCOTRAN );
354 }
355 else
356 {
357 talpha = ALPHA;
358 GemmTb = ( ( TrA == CCOTRAN ) ? CCOTRAN : CTRAN );
359 }
360 GemmTa = CNOTRAN;
361 }
362/*
363* Compute descriptor Bd0 for sub( B )
364*/
365 PB_Cdescribe( Bm, Bn, IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj,
366 &Bld, &Bimb1, &Binb1, &Bmb, &Bnb, &Brow, &Bcol, Bd0 );
367
368 Bmp = PB_Cnumroc( Bm, 0, Bimb1, Bmb, myrow, Brow, nprow );
369 Bnq = PB_Cnumroc( Bn, 0, Binb1, Bnb, mycol, Bcol, npcol );
370 if( ( Bmp > 0 ) && ( Bnq > 0 ) ) Bptr = Mptr( B, Bii, Bjj, Bld, size );
371/*
372* When sub( A ) is not replicated and backward pass on sub( A ), find the
373* virtual process q owning the last row or column of sub( A ).
374*/
375 if( !( AisR = ( ( AsrcR < 0 ) || ( AnprocsR == 1 ) ) ) && !Afwd )
376 {
377 tmp = PB_Cindxg2p( M - 1, Ainb1R, AnbR, ArocR, ArocR, AnprocsR );
378 q = MModSub( tmp, ArocR, AnprocsR );
379 }
380/*
381* When sub( C ) is not replicated and backward pass on sub( C ), find the
382* virtual process p owning the last row or column of sub( C ).
383*/
384 if( !( CisR = ( ( Crow < 0 ) || ( nprow == 1 ) ) ) && !Cfwd )
385 {
386 tmp = PB_Cindxg2p( M - 1, Cimb1, Cmb, Crow, Crow, nprow );
387 p = MModSub( tmp, Crow, nprow );
388 }
389/*
390* Loop over the virtual process grid induced by the rows or columns of
391* sub( A ) and sub( C ).
392*/
393 lcmb = PB_Clcm( ( maxp = ( CisR ? 1 : nprow ) ) * Cmb,
394 ( maxq = ( AisR ? 1 : AnprocsR ) ) * AnbR );
395 m = M;
396 maxpm1 = maxp - 1;
397
398 while( m > 0 )
399 {
400/*
401* Initialize local virtual matrix in process (p,q)
402*/
403 AcurrocR = ( AisR ? -1 : MModAdd( ArocR, q, AnprocsR ) );
404 Akk = PB_Cg2lrem( AiR, AinbR, AnbR, AcurrocR, AsrcR, AnprocsR );
405 AnpR = PB_Cnumroc( M, 0, Ainb1R, AnbR, AcurrocR, ArocR, AnprocsR );
406
407 Ccurrow = ( CisR ? -1 : MModAdd( Crow, p, nprow ) );
408 Ckk = PB_Cg2lrem( IC, Cimb, Cmb, Ccurrow, Csrc, nprow );
409 Cmp = PB_Cnumroc( M, 0, Cimb1, Cmb, Ccurrow, Crow, nprow );
410
411 PB_CVMinit( &VM, 0, Cmp, AnpR, Cimb1, Ainb1R, Cmb, AnbR, p, q,
412 maxp, maxq, lcmb );
413/*
414* Find how many diagonals in this virtual process
415*/
416 npq = PB_CVMnpq( &VM );
417
418 m -= npq;
419/*
420* Re-adjust the number of rows or columns to be (un)packed, in order to
421* average the message sizes.
422*/
423 if( npq ) mbb = npq / ( ( npq - 1 ) / mb + 1 );
424
425 while( npq )
426 {
427 mbb = MIN( mbb, npq );
428/*
429* Find out how many rows or columns of sub( A ) and sub( C ) are contiguous
430*/
431 PB_CVMcontig( &VM, &nrpq, &ncpq, &Coff, &Aoff );
432
433 if( nota )
434 {
435/*
436* Compute the descriptor DBUFA for the buffer that will contained the packed
437* columns of sub( A ).
438*/
439 if( ( Afr = ( ncpq < mbb ) ) != 0 )
440 {
441/*
442* If rows of sub( A ) are not contiguous, then allocate the buffer and
443* pack the mbb rows of sub( A ).
444*/
445 Abufld = mbb;
446 if( AisR || ( AmyprocR == AcurrocR ) )
447 {
448 Abuf = PB_Cmalloc( AnpD * mbb * size );
449 PB_CVMpack( TYPE, &VM, COLUMN, &Aroc, PACKING, NOTRAN, mbb,
450 AnpD, one, Mptr( A, Akk, AiiD, Ald, size ), Ald,
451 zero, Abuf, Abufld );
452 }
453 }
454 else
455 {
456/*
457* Otherwise, re-use sub( B ) directly.
458*/
459 Abufld = Ald;
460 if( AisR || ( AmyprocR == AcurrocR ) )
461 Abuf = Mptr( A, Akk+Aoff, AiiD, Ald, size );
462 }
463 PB_Cdescset( DBUFA, mbb, K, mbb, Ainb1D, mbb, AnbD, AcurrocR,
464 ArocD, ctxt, Abufld );
465 }
466 else
467 {
468/*
469* Compute the descriptor DBUFA for the buffer that will contained the packed
470* columns of sub( A ).
471*/
472 if( ( Afr = ( ncpq < mbb ) ) != 0 )
473 {
474/*
475* If columns of sub( A ) are not contiguous, then allocate the buffer and pack
476* the mbb columns of sub( A ).
477*/
478 Abufld = MAX( 1, AnpD );
479 if( AisR || ( AmyprocR == AcurrocR ) )
480 {
481 Abuf = PB_Cmalloc( AnpD * mbb * size );
482 PB_CVMpack( TYPE, &VM, COLUMN, &Aroc, PACKING, NOTRAN, mbb,
483 AnpD, one, Mptr( A, AiiD, Akk, Ald, size ), Ald,
484 zero, Abuf, Abufld );
485 }
486 }
487 else
488 {
489/*
490* Otherwise, re-use sub( A ) directly.
491*/
492 Abufld = Ald;
493 if( AisR || ( AmyprocR == AcurrocR ) )
494 Abuf = Mptr( A, AiiD, Akk+Aoff, Ald, size );
495 }
496 PB_Cdescset( DBUFA, K, mbb, Ainb1D, mbb, AnbD, mbb, ArocD,
497 AcurrocR, ctxt, Abufld );
498 }
499
500 if( notb )
501 {
502/*
503* Replicate this panel of rows or columns of sub( A ) over sub( B ) -> WA
504*/
505 PB_CInV( TYPE, NOCONJG, COLUMN, Bm, Bn, Bd0, mbb, Abuf, 0, 0,
506 DBUFA, &Aroc, &WA, WAd, &WAfr );
507/*
508* Allocate space for temporary results in scope of sub( B ) -> WC
509*/
510 PB_COutV( TYPE, ROW, INIT, Bm, Bn, Bd0, mbb, &WC, WCd, &WCfr,
511 &WCsum );
512/*
513* Local matrix-matrix multiply iff I own some data
514*/
515 if( Bmp > 0 && Bnq > 0 )
516 gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( &GemmTb ), &mbb, &Bnq, &Bmp,
517 talpha, WA, &WAd[LLD_], Bptr, &Bld, zero, WC, &WCd[LLD_] );
518 if( WAfr ) free( WA );
519 if( Afr && ( AisR || ( AmyprocR == AcurrocR ) ) )
520 if( Abuf ) free( Abuf );
521/*
522* Accumulate the intermediate results in WC
523*/
524 if( WCsum )
525 {
526 WCd[RSRC_] = Ccurrow;
527 if( Bnq > 0 )
528 gsum2d( ctxt, COLUMN, &top, mbb, Bnq, WC, WCd[LLD_],
529 WCd[RSRC_], mycol );
530 }
531/*
532* Compute the descriptor DBUFC for the buffer that will contained the packed
533* rows of sub( C ). Allocate it.
534*/
535 if( ( Cfr = ( nrpq < mbb ) ) != 0 )
536 {
537/*
538* If rows of sub( C ) are not contiguous, then allocate the buffer
539*/
540 Cbufld = mbb; tbeta = zero;
541 if( CisR || ( myrow == Ccurrow ) )
542 Cbuf = PB_Cmalloc( Cnq * mbb * size );
543 }
544 else
545 {
546/*
547* Otherwise re-use sub( C )
548*/
549 Cbufld = Cld; tbeta = BETA;
550 if( CisR || ( myrow == Ccurrow ) )
551 Cbuf = Mptr( C, Ckk+Coff, Cjj, Cld, size );
552 }
553 PB_Cdescset( DBUFC, mbb, N, mbb, Cinb1, mbb, Cnb, Ccurrow, Ccol,
554 ctxt, Cbufld );
555/*
556* Cbuf := Cbuf + WC
557*/
558 PB_Cpaxpby( TYPE, NOCONJG, mbb, N, one, WC, 0, 0, WCd, ROW, tbeta,
559 Cbuf, 0, 0, DBUFC, ROW );
560/*
561* Unpack the mbb rows of sub( C ) and release the buffer containing them.
562*/
563 if( Cfr && ( CisR || ( myrow == Ccurrow ) ) )
564 {
565 PB_CVMpack( TYPE, &VM, ROW, ROW, UNPACKING, NOTRAN, mbb, Cnq,
566 BETA, Mptr( C, Ckk, Cjj, Cld, size ), Cld, one, Cbuf,
567 Cbufld );
568 if( Cbuf ) free( Cbuf );
569 }
570 if( WCfr ) free( WC );
571 }
572 else
573 {
574/*
575* Replicate this panel of rows or columns of sub( A ) over sub( B ) -> WA
576*/
577 PB_CInV( TYPE, NOCONJG, ROW, Bm, Bn, Bd0, mbb, Abuf, 0, 0,
578 DBUFA, &Aroc, &WA, WAd, &WAfr );
579/*
580* Allocate space for temporary results in scope of sub( A ) -> WC
581*/
582 PB_COutV( TYPE, COLUMN, INIT, Bm, Bn, Bd0, mbb, &WC, WCd, &WCfr,
583 &WCsum );
584/*
585* Local matrix-matrix multiply iff I own some data
586*/
587 if( Bmp > 0 && Bnq > 0 )
588 gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( &GemmTb ), &Bmp, &mbb, &Bnq,
589 talpha, Bptr, &Bld, WA, &WAd[LLD_], zero, WC, &WCd[LLD_] );
590 if( WAfr ) free( WA );
591 if( Afr && ( AisR || ( AmyprocR == AcurrocR ) ) )
592 if( Abuf ) free( Abuf );
593/*
594* Accumulate the intermediate results in WC
595*/
596 if( WCsum )
597 {
598 WCd[CSRC_] = 0;
599 if( Bmp > 0 )
600 gsum2d( ctxt, ROW, &top, Bmp, mbb, WC, WCd[LLD_], myrow,
601 WCd[CSRC_] );
602 }
603/*
604* Compute the descriptor DBUFC for the buffer that will contained the packed
605* rows of sub( C ). Allocate it.
606*/
607 if( ( Cfr = ( nrpq < mbb ) ) != 0 )
608 {
609/*
610* If rows of sub( C ) are not contiguous, then allocate the buffer
611*/
612 Cbufld = mbb; tbeta = zero;
613 if( CisR || ( myrow == Ccurrow ) )
614 Cbuf = PB_Cmalloc( Cnq * mbb * size );
615 }
616 else
617 {
618/*
619* Otherwise re-use sub( C )
620*/
621 Cbufld = Cld; tbeta = BETA;
622 if( CisR || ( myrow == Ccurrow ) )
623 Cbuf = Mptr( C, Ckk+Coff, Cjj, Cld, size );
624 }
625 PB_Cdescset( DBUFC, mbb, N, mbb, Cinb1, mbb, Cnb, Ccurrow, Ccol,
626 ctxt, Cbufld );
627/*
628* Cbuf := Cbuf + WC'
629*/
630 PB_Cpaxpby( TYPE, ( TrB == CCOTRAN ? CONJG : NOCONJG ), N, mbb,
631 one, WC, 0, 0, WCd, COLUMN, tbeta, Cbuf, 0, 0, DBUFC,
632 ROW );
633/*
634* Unpack the mbb rows of sub( C ) and release the buffer containing them.
635*/
636 if( Cfr && ( CisR || ( myrow == Ccurrow ) ) )
637 {
638 PB_CVMpack( TYPE, &VM, ROW, ROW, UNPACKING, NOTRAN, mbb, Cnq,
639 BETA, Mptr( C, Ckk, Cjj, Cld, size ), Cld, one, Cbuf,
640 Cbufld );
641 if( Cbuf ) free( Cbuf );
642 }
643 if( WCfr ) free( WC );
644 }
645/*
646* Update the local indexes of sub( B ) and sub( C )
647*/
648 PB_CVMupdate( &VM, mbb, &Ckk, &Akk );
649
650 npq -= mbb;
651 }
652/*
653* Go to next or previous virtual process row or column
654*/
655 if( ( Cfwd && ( p == maxpm1 ) ) ||
656 ( !( Cfwd ) && ( p == 0 ) ) )
657 q = ( Afwd ? MModAdd1( q, maxq ) : MModSub1( q, maxq ) );
658 p = ( Cfwd ? MModAdd1( p, maxp ) : MModSub1( p, maxp ) );
659 }
660
661 if( TrB == CCOTRAN ) free( talpha );
662/*
663* End of PB_CpgemmAC
664*/
665}
#define Int
Definition Bconfig.h:22
F_VOID_FCT(* GEMM_T)()
Definition pblas.h:317
void(* GSUM2D_T)()
Definition pblas.h:286
#define C2F_CHAR(a)
Definition pblas.h:125
#define CCOLUMN
Definition PBblacs.h:20
#define TOP_GET
Definition PBblacs.h:50
#define COLUMN
Definition PBblacs.h:45
#define COMBINE
Definition PBblacs.h:49
#define CROW
Definition PBblacs.h:21
#define ROW
Definition PBblacs.h:46
void Cblacs_gridinfo()
#define NOTRAN
Definition PBblas.h:44
#define CONJG
Definition PBblas.h:47
#define NOCONJG
Definition PBblas.h:45
#define CNOTRAN
Definition PBblas.h:18
#define CTRAN
Definition PBblas.h:20
#define CCOTRAN
Definition PBblas.h:22
#define INIT
Definition PBblas.h:61
#define CFORWARD
Definition PBblas.h:38
#define pilaenv_
Definition PBpblas.h:44
#define CTXT_
Definition PBtools.h:38
#define UNPACKING
Definition PBtools.h:54
void PB_CVMinit()
Int PB_Cfirstnb()
#define MAX(a_, b_)
Definition PBtools.h:77
#define MB_
Definition PBtools.h:43
char * PB_Cmalloc()
void PB_Cinfog2l()
#define MModSub(I1, I2, d)
Definition PBtools.h:102
#define PACKING
Definition PBtools.h:53
#define MIN(a_, b_)
Definition PBtools.h:76
#define Mptr(a_, i_, j_, lda_, siz_)
Definition PBtools.h:132
#define LLD_
Definition PBtools.h:47
Int PB_Cnumroc()
Int PB_CVMpack()
char * PB_Ctop()
void PB_CInV()
void PB_CVMupdate()
#define RSRC_
Definition PBtools.h:45
void PB_Cdescset()
void PB_COutV()
#define MModAdd1(I, d)
Definition PBtools.h:100
#define MModAdd(I1, I2, d)
Definition PBtools.h:97
#define INB_
Definition PBtools.h:42
Int PB_CVMnpq()
#define MModSub1(I, d)
Definition PBtools.h:105
#define CSRC_
Definition PBtools.h:46
Int PB_Clcm()
#define IMB_
Definition PBtools.h:41
void PB_CpgemmAC()
Int PB_Cindxg2p()
Int PB_Cg2lrem()
#define Mupcase(C)
Definition PBtools.h:83
#define DLEN_
Definition PBtools.h:48
#define NB_
Definition PBtools.h:44
void PB_Cconjg()
void PB_CVMcontig()
void PB_Cpaxpby()
void PB_Cdescribe()
#define TYPE
Definition clamov.c:7