SCALAPACK 2.2.2
LAPACK: Linear Algebra PACKage
Loading...
Searching...
No Matches
PB_CpgemmBC.c
Go to the documentation of this file.
1/* ---------------------------------------------------------------------
2*
3* -- PBLAS auxiliary routine (version 2.0) --
4* University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5* and University of California, Berkeley.
6* April 1, 1998
7*
8* ---------------------------------------------------------------------
9*/
10/*
11* Include files
12*/
13#include "../pblas.h"
14#include "../PBpblas.h"
15#include "../PBtools.h"
16#include "../PBblacs.h"
17#include "../PBblas.h"
18
19#ifdef __STDC__
20void PB_CpgemmBC( PBTYP_T * TYPE, char * DIRECB, char * DIRECC,
21 char * TRANSA, char * TRANSB, Int M, Int N, Int K,
22 char * ALPHA, char * A, Int IA, Int JA, Int * DESCA,
23 char * B, Int IB, Int JB, Int * DESCB, char * BETA,
24 char * C, Int IC, Int JC, Int * DESCC )
25#else
26void PB_CpgemmBC( TYPE, DIRECB, DIRECC, TRANSA, TRANSB, M, N, K, ALPHA,
27 A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC,
28 DESCC )
29/*
30* .. Scalar Arguments ..
31*/
32 char * DIRECB, * DIRECC, * TRANSA, * TRANSB;
33 Int IA, IB, IC, JA, JB, JC, K, M, N;
34 char * ALPHA, * BETA;
35 PBTYP_T * TYPE;
36/*
37* .. Array Arguments ..
38*/
39 Int * DESCA, * DESCB, * DESCC;
40 char * A, * B, * C;
41#endif
42{
43/*
44* Purpose
45* =======
46*
47* PB_CpgemmBC performs one of the matrix-matrix operations
48*
49* sub( C ) := alpha*op( sub( A ) )*op( sub( B ) ) + beta*sub( C ),
50*
51* where
52*
53* sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of
54* op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ).
55*
56* Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+K-1) if TRANSA = 'N',
57* A(IA:IA+K-1,JA:JA+M-1)' if TRANSA = 'T',
58* conjg(A(IA:IA+K-1,JA:JA+M-1)') if TRANSA = 'C',
59*
60* and, op( sub( B ) ) denotes B(IB:IB+K-1,JB:JB+N-1) if TRANSB = 'N',
61* B(IB:IB+N-1,JB:JB+K-1)' if TRANSB = 'T',
62* conjg(B(IB:IB+N-1,JB:JB+K-1)') if TRANSB = 'C'.
63*
64* Alpha and beta are scalars. A, B and C are matrices; op( sub( A ) )
65* is an m by k submatrix, op( sub( B ) ) is an k by n submatrix and
66* sub( C ) is an m by n submatrix.
67*
68* This is the inner-product algorithm using the logical LCM algorithmic
69* blocking technique. The submatrix operand sub( A ) stays in place.
70*
71* Notes
72* =====
73*
74* A description vector is associated with each 2D block-cyclicly dis-
75* tributed matrix. This vector stores the information required to
76* establish the mapping between a matrix entry and its corresponding
77* process and memory location.
78*
79* In the following comments, the character _ should be read as
80* "of the distributed matrix". Let A be a generic term for any 2D
81* block cyclicly distributed matrix. Its description vector is DESC_A:
82*
83* NOTATION STORED IN EXPLANATION
84* ---------------- --------------- ------------------------------------
85* DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
86* CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
87* the NPROW x NPCOL BLACS process grid
88* A is distributed over. The context
89* itself is global, but the handle
90* (the integer value) may vary.
91* M_A (global) DESCA[ M_ ] The number of rows in the distribu-
92* ted matrix A, M_A >= 0.
93* N_A (global) DESCA[ N_ ] The number of columns in the distri-
94* buted matrix A, N_A >= 0.
95* IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
96* block of the matrix A, IMB_A > 0.
97* INB_A (global) DESCA[ INB_ ] The number of columns of the upper
98* left block of the matrix A,
99* INB_A > 0.
100* MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
101* bute the last M_A-IMB_A rows of A,
102* MB_A > 0.
103* NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
104* bute the last N_A-INB_A columns of
105* A, NB_A > 0.
106* RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
107* row of the matrix A is distributed,
108* NPROW > RSRC_A >= 0.
109* CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
110* first column of A is distributed.
111* NPCOL > CSRC_A >= 0.
112* LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
113* array storing the local blocks of
114* the distributed matrix A,
115* IF( Lc( 1, N_A ) > 0 )
116* LLD_A >= MAX( 1, Lr( 1, M_A ) )
117* ELSE
118* LLD_A >= 1.
119*
120* Let K be the number of rows of a matrix A starting at the global in-
121* dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
122* that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
123* receive if these K rows were distributed over NPROW processes. If K
124* is the number of columns of a matrix A starting at the global index
125* JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
126* lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
127* these K columns were distributed over NPCOL processes.
128*
129* The values of Lr() and Lc() may be determined via a call to the func-
130* tion PB_Cnumroc:
131* Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
132* Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
133*
134* Arguments
135* =========
136*
137* TYPE (local input) pointer to a PBTYP_T structure
138* On entry, TYPE is a pointer to a structure of type PBTYP_T,
139* that contains type information (See pblas.h).
140*
141* DIRECB (global input) pointer to CHAR
142* On entry, DIRECB specifies the direction in which the rows
143* or columns of sub( B ) should be looped over as follows:
144* DIRECB = 'F' or 'f' forward or increasing,
145* DIRECB = 'B' or 'b' backward or decreasing.
146*
147* DIRECC (global input) pointer to CHAR
148* On entry, DIRECC specifies the direction in which the rows
149* or columns of sub( C ) should be looped over as follows:
150* DIRECC = 'F' or 'f' forward or increasing,
151* DIRECC = 'B' or 'b' backward or decreasing.
152*
153* TRANSA (global input) pointer to CHAR
154* On entry, TRANSA specifies the form of op( sub( A ) ) to be
155* used in the matrix multiplication as follows:
156*
157* TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ),
158* TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )',
159* TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ).
160*
161* TRANSB (global input) pointer to CHAR
162* On entry, TRANSB specifies the form of op( sub( B ) ) to be
163* used in the matrix multiplication as follows:
164*
165* TRANSB = 'N' or 'n' op( sub( B ) ) = sub( B ),
166* TRANSB = 'T' or 't' op( sub( B ) ) = sub( B )',
167* TRANSB = 'C' or 'c' op( sub( B ) ) = conjg( sub( B )' ).
168*
169* M (global input) INTEGER
170* On entry, M specifies the number of rows of the submatrix
171* op( sub( A ) ) and of the submatrix sub( C ). M must be at
172* least zero.
173*
174* N (global input) INTEGER
175* On entry, N specifies the number of columns of the submatrix
176* op( sub( B ) ) and the number of columns of the submatrix
177* sub( C ). N must be at least zero.
178*
179* K (global input) INTEGER
180* On entry, K specifies the number of columns of the submatrix
181* op( sub( A ) ) and the number of rows of the submatrix
182* op( sub( B ) ). K must be at least zero.
183*
184* ALPHA (global input) pointer to CHAR
185* On entry, ALPHA specifies the scalar alpha. When ALPHA is
186* supplied as zero then the local entries of the arrays A and
187* B corresponding to the entries of the submatrices sub( A )
188* and sub( B ) respectively need not be set on input.
189*
190* A (local input) pointer to CHAR
191* On entry, A is an array of dimension (LLD_A, Ka), where Ka is
192* at least Lc( 1, JA+K-1 ) when TRANSA = 'N' or 'n', and is at
193* least Lc( 1, JA+M-1 ) otherwise. Before entry, this array
194* contains the local entries of the matrix A.
195*
196* IA (global input) INTEGER
197* On entry, IA specifies A's global row index, which points to
198* the beginning of the submatrix sub( A ).
199*
200* JA (global input) INTEGER
201* On entry, JA specifies A's global column index, which points
202* to the beginning of the submatrix sub( A ).
203*
204* DESCA (global and local input) INTEGER array
205* On entry, DESCA is an integer array of dimension DLEN_. This
206* is the array descriptor for the matrix A.
207*
208* B (local input) pointer to CHAR
209* On entry, B is an array of dimension (LLD_B, Kb), where Kb is
210* at least Lc( 1, JB+N-1 ) when TRANSB = 'N' or 'n', and is at
211* least Lc( 1, JB+K-1 ) otherwise. Before entry, this array
212* contains the local entries of the matrix B.
213*
214* IB (global input) INTEGER
215* On entry, IB specifies B's global row index, which points to
216* the beginning of the submatrix sub( B ).
217*
218* JB (global input) INTEGER
219* On entry, JB specifies B's global column index, which points
220* to the beginning of the submatrix sub( B ).
221*
222* DESCB (global and local input) INTEGER array
223* On entry, DESCB is an integer array of dimension DLEN_. This
224* is the array descriptor for the matrix B.
225*
226* BETA (global input) pointer to CHAR
227* On entry, BETA specifies the scalar beta. When BETA is
228* supplied as zero then the local entries of the array C
229* corresponding to the entries of the submatrix sub( C ) need
230* not be set on input.
231*
232* C (local input/local output) pointer to CHAR
233* On entry, C is an array of dimension (LLD_C, Kc), where Kc is
234* at least Lc( 1, JC+N-1 ). Before entry, this array contains
235* the local entries of the matrix C.
236* On exit, the entries of this array corresponding to the local
237* entries of the submatrix sub( C ) are overwritten by the
238* local entries of the m by n updated submatrix.
239*
240* IC (global input) INTEGER
241* On entry, IC specifies C's global row index, which points to
242* the beginning of the submatrix sub( C ).
243*
244* JC (global input) INTEGER
245* On entry, JC specifies C's global column index, which points
246* to the beginning of the submatrix sub( C ).
247*
248* DESCC (global and local input) INTEGER array
249* On entry, DESCC is an integer array of dimension DLEN_. This
250* is the array descriptor for the matrix C.
251*
252* -- Written on April 1, 1998 by
253* Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
254*
255* ---------------------------------------------------------------------
256*/
257/*
258* .. Local Scalars ..
259*/
260 char Broc, GemmTa, GemmTb, TrA, TrB, * one, * talpha, * tbeta,
261 top, * zero;
262 Int Acol, Aii, Aimb1, Ainb1, Ajj, Ald, Am, Amb, Amp, An, Anb,
263 Anq, Arow, Bbufld, BcurrocR, Bfr, Bfwd, BiD, BiR, BiiD, BiiR,
264 BinbD, BinbR, Binb1D, Binb1R, BisR, Bkk, Bld, BmyprocD,
265 BmyprocR, BnbD, BnbR, BnpD, BnpR, BnprocsD, BnprocsR, Boff,
266 BrocD, BrocR, BsrcR, Bsrc_, Cbufld, Ccol, Ccurcol, Cfr, Cfwd,
267 Cii, Cimb, Cimb1, Cinb, Cinb1, CisR, Cjj, Ckk, Cld, Cmb, Cmp,
268 Cnb, Cnq, Coff, Crow, Csrc, WBfr, WCfr, WCsum, ctxt, lcmb,
269 maxp, maxpm1, maxq, mycol, myrow, n, nb, nbb, ncpq, nota,
270 notb, npcol, npq=0, nprow, nrpq, p=0, q=0, size, tmp;
271 GEMM_T gemm;
272 GSUM2D_T gsum2d;
273/*
274* .. Local Arrays ..
275*/
276 Int Ad0[DLEN_], DBUFB[DLEN_], DBUFC[DLEN_], WBd[DLEN_],
277 WCd[DLEN_];
278 PB_VM_T VM;
279 char * Aptr = NULL, * Bbuf = NULL, * Cbuf = NULL, * WB = NULL,
280 * WC = NULL;
281/* ..
282* .. Executable Statements ..
283*
284*/
285 Cblacs_gridinfo( ( ctxt = DESCC[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
286
287 Bfwd = ( Mupcase( DIRECB[0] ) == CFORWARD );
288 Cfwd = ( Mupcase( DIRECC[0] ) == CFORWARD );
289 nota = ( ( TrA = Mupcase( TRANSA[0] ) ) == CNOTRAN );
290 notb = ( ( TrB = Mupcase( TRANSB[0] ) ) == CNOTRAN );
291
292 size = TYPE->size; one = TYPE->one; zero = TYPE->zero;
293 gemm = TYPE->Fgemm; gsum2d = TYPE->Cgsum2d;
294 nb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
295/*
296* Compute local information for sub( A ), sub( B ) and sub( C )
297*/
298 if( notb )
299 {
300 BiD = IB; BiR = JB;
301 Bsrc_ = CSRC_; Broc = CCOLUMN;
302 BinbD = DESCB[IMB_ ]; BinbR = DESCB[INB_];
303 BnbD = DESCB[MB_ ]; BnbR = DESCB[NB_ ];
304 BsrcR = DESCB[Bsrc_]; Bld = DESCB[LLD_];
305 BmyprocD = myrow; BnprocsD = nprow;
306 BmyprocR = mycol; BnprocsR = npcol;
307 PB_Cinfog2l( IB, JB, DESCB, BnprocsD, BnprocsR, BmyprocD, BmyprocR,
308 &BiiD, &BiiR, &BrocD, &BrocR );
309 }
310 else
311 {
312 BiD = JB; BiR = IB;
313 Bsrc_ = RSRC_; Broc = CROW;
314 BinbR = DESCB[IMB_ ]; BinbD = DESCB[INB_];
315 BnbR = DESCB[MB_ ]; BnbD = DESCB[NB_ ];
316 BsrcR = DESCB[Bsrc_]; Bld = DESCB[LLD_];
317 BmyprocD = mycol; BnprocsD = npcol;
318 BmyprocR = myrow; BnprocsR = nprow;
319 PB_Cinfog2l( IB, JB, DESCB, BnprocsR, BnprocsD, BmyprocR, BmyprocD,
320 &BiiR, &BiiD, &BrocR, &BrocD );
321 }
322 Binb1D = PB_Cfirstnb( K, BiD, BinbD, BnbD );
323 BnpD = PB_Cnumroc( K, 0, Binb1D, BnbD, BmyprocD, BrocD, BnprocsD );
324 Binb1R = PB_Cfirstnb( N, BiR, BinbR, BnbR );
325
326 Cimb = DESCC[IMB_ ]; Cinb = DESCC[INB_];
327 Cmb = DESCC[MB_ ]; Cnb = DESCC[NB_ ];
328 Csrc = DESCC[CSRC_]; Cld = DESCC[LLD_];
329 PB_Cinfog2l( IC, JC, DESCC, nprow, npcol, myrow, mycol, &Cii, &Cjj,
330 &Crow, &Ccol );
331 Cimb1 = PB_Cfirstnb( M, IC, Cimb, Cmb );
332 Cmp = PB_Cnumroc( M, 0, Cimb1, Cmb, myrow, Crow, nprow );
333 Cinb1 = PB_Cfirstnb( N, JC, Cinb, Cnb );
334/*
335* Retrieve the BLACS combine topology, compute conjugate of alpha for the
336* conjugate transpose case and set the transpose parameters to be passed to
337* the BLAS matrix multiply routine.
338*/
339 if( nota )
340 {
341 Am = M; An = K;
342 top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
343 talpha = ALPHA; GemmTa = CNOTRAN; GemmTb = ( notb ? CTRAN : TrB );
344 }
345 else
346 {
347 Am = K; An = M;
348 top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
349 if( TrA == CCOTRAN )
350 {
351 talpha = PB_Cmalloc( size ); PB_Cconjg( TYPE, ALPHA, talpha );
352 GemmTa = ( ( TrB == CCOTRAN ) ? CTRAN : CCOTRAN );
353 }
354 else
355 {
356 talpha = ALPHA;
357 GemmTa = ( ( TrB == CCOTRAN ) ? CCOTRAN : CTRAN );
358 }
359 GemmTb = CNOTRAN;
360 }
361/*
362* Compute descriptor Ad0 for sub( A )
363*/
364 PB_Cdescribe( Am, An, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj,
365 &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 );
366
367 Amp = PB_Cnumroc( Am, 0, Aimb1, Amb, myrow, Arow, nprow );
368 Anq = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol );
369 if( ( Amp > 0 ) && ( Anq > 0 ) ) { Aptr = Mptr( A, Aii, Ajj, Ald, size ); }
370/*
371* When sub( B ) is not replicated and backward pass on sub( B ), find the
372* virtual process q owning the last row or column of sub( B ).
373*/
374 if( !( BisR = ( ( BsrcR < 0 ) || ( BnprocsR == 1 ) ) ) && !Bfwd )
375 {
376 tmp = PB_Cindxg2p( N - 1, Binb1R, BnbR, BrocR, BrocR, BnprocsR );
377 q = MModSub( tmp, BrocR, BnprocsR );
378 }
379/*
380* When sub( C ) is not replicated and backward pass on sub( C ), find the
381* virtual process p owning the last row or column of sub( C ).
382*/
383 if( !( CisR = ( ( Ccol < 0 ) || ( npcol == 1 ) ) ) && !Cfwd )
384 {
385 tmp = PB_Cindxg2p( N - 1, Cinb1, Cnb, Ccol, Ccol, npcol );
386 p = MModSub( tmp, Ccol, npcol );
387 }
388/*
389* Loop over the virtual process grid induced by the rows or columns of
390* sub( B ) and sub( C ).
391*/
392 lcmb = PB_Clcm( ( maxp = ( CisR ? 1 : npcol ) ) * Cnb,
393 ( maxq = ( BisR ? 1 : BnprocsR ) ) * BnbR );
394 n = N;
395 maxpm1 = maxp - 1;
396
397 while( n > 0 )
398 {
399/*
400* Initialize local virtual matrix in process (p,q)
401*/
402 BcurrocR = ( BisR ? -1 : MModAdd( BrocR, q, BnprocsR ) );
403 Bkk = PB_Cg2lrem( BiR, BinbR, BnbR, BcurrocR, BsrcR, BnprocsR );
404 BnpR = PB_Cnumroc( N, 0, Binb1R, BnbR, BcurrocR, BrocR, BnprocsR );
405
406 Ccurcol = ( CisR ? -1 : MModAdd( Ccol, p, npcol ) );
407 Ckk = PB_Cg2lrem( JC, Cinb, Cnb, Ccurcol, Csrc, npcol );
408 Cnq = PB_Cnumroc( N, 0, Cinb1, Cnb, Ccurcol, Ccol, npcol );
409
410 PB_CVMinit( &VM, 0, Cnq, BnpR, Cinb1, Binb1R, Cnb, BnbR, p, q,
411 maxp, maxq, lcmb );
412/*
413* Find how many diagonals in this virtual process
414*/
415 npq = PB_CVMnpq( &VM );
416
417 n -= npq;
418/*
419* Re-adjust the number of rows or columns to be (un)packed, in order to
420* average the message sizes.
421*/
422 if( npq ) nbb = npq / ( ( npq - 1 ) / nb + 1 );
423
424 while( npq )
425 {
426 nbb = MIN( nbb, npq );
427/*
428* Find out how many rows or columns of sub( B ) and sub( C ) are contiguous
429*/
430 PB_CVMcontig( &VM, &nrpq, &ncpq, &Coff, &Boff );
431
432 if( notb )
433 {
434/*
435* Compute the descriptor DBUFB for the buffer that will contained the packed
436* columns of sub( B ).
437*/
438 if( ( Bfr = ( ncpq < nbb ) ) != 0 )
439 {
440/*
441* If columns of sub( B ) are not contiguous, then allocate the buffer and
442* pack the nbb columns of sub( B ).
443*/
444 Bbufld = MAX( 1, BnpD );
445 if( BisR || ( BmyprocR == BcurrocR ) )
446 {
447 Bbuf = PB_Cmalloc( BnpD * nbb * size );
448 PB_CVMpack( TYPE, &VM, COLUMN, &Broc, PACKING, NOTRAN, nbb,
449 BnpD, one, Mptr( B, BiiD, Bkk, Bld, size ), Bld,
450 zero, Bbuf, Bbufld );
451 }
452 }
453 else
454 {
455/*
456* Otherwise, re-use sub( B ) directly.
457*/
458 Bbufld = Bld;
459 if( BisR || ( BmyprocR == BcurrocR ) )
460 Bbuf = Mptr( B, BiiD, Bkk+Boff, Bld, size );
461 }
462 PB_Cdescset( DBUFB, K, nbb, Binb1D, nbb, BnbD, nbb, BrocD,
463 BcurrocR, ctxt, Bbufld );
464 }
465 else
466 {
467/*
468* Compute the descriptor DBUFB for the buffer that will contained the packed
469* rows of sub( B ).
470*/
471 if( ( Bfr = ( ncpq < nbb ) ) != 0 )
472 {
473/*
474* If rows of sub( B ) are not contiguous, then allocate the buffer and pack
475* the nbb rows of sub( B ).
476*/
477 Bbufld = nbb;
478 if( BisR || ( BmyprocR == BcurrocR ) )
479 {
480 Bbuf = PB_Cmalloc( BnpD * nbb * size );
481 PB_CVMpack( TYPE, &VM, COLUMN, &Broc, PACKING, NOTRAN, nbb,
482 BnpD, one, Mptr( B, Bkk, BiiD, Bld, size ), Bld,
483 zero, Bbuf, Bbufld );
484 }
485 }
486 else
487 {
488/*
489* Otherwise, re-use sub( B ) directly.
490*/
491 Bbufld = Bld;
492 if( BisR || ( BmyprocR == BcurrocR ) )
493 Bbuf = Mptr( B, Bkk+Boff, BiiD, Bld, size );
494 }
495 PB_Cdescset( DBUFB, nbb, K, nbb, Binb1D, nbb, BnbD, BcurrocR,
496 BrocD, ctxt, Bbufld );
497 }
498
499 if( nota )
500 {
501/*
502* Replicate this panel of rows or columns of sub( B ) over sub( A ) -> WB
503*/
504 PB_CInV( TYPE, NOCONJG, ROW, Am, An, Ad0, nbb, Bbuf, 0, 0,
505 DBUFB, &Broc, &WB, WBd, &WBfr );
506/*
507* Allocate space for temporary results in scope of sub( A ) -> WC
508*/
509 PB_COutV( TYPE, COLUMN, INIT, Am, An, Ad0, nbb, &WC, WCd, &WCfr,
510 &WCsum );
511/*
512* Local matrix-matrix multiply iff I own some data
513*/
514 if( Amp > 0 && Anq > 0 )
515 gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( &GemmTb ), &Amp, &nbb,
516 &Anq, talpha, Aptr, &Ald, WB, &WBd[LLD_], zero,
517 WC, &WCd[LLD_] );
518 if( WBfr ) free( WB );
519 if( Bfr && ( BisR || ( BmyprocR == BcurrocR ) ) )
520 if( Bbuf ) free( Bbuf );
521/*
522* Accumulate the intermediate results in WC
523*/
524 if( WCsum )
525 {
526 WCd[CSRC_] = Ccurcol;
527 if( Amp > 0 )
528 gsum2d( ctxt, ROW, &top, Amp, nbb, WC, WCd[LLD_], myrow,
529 WCd[CSRC_] );
530 }
531/*
532* Compute the descriptor DBUFC for the buffer that will contained the packed
533* columns of sub( C ). Allocate it.
534*/
535 if( ( Cfr = ( nrpq < nbb ) ) != 0 )
536 {
537/*
538* If columns of sub( C ) are not contiguous, then allocate the buffer
539*/
540 Cbufld = MAX( 1, Cmp ); tbeta = zero;
541 if( CisR || ( mycol == Ccurcol ) )
542 Cbuf = PB_Cmalloc( Cmp * nbb * size );
543 }
544 else
545 {
546/*
547* Otherwise re-use sub( C )
548*/
549 Cbufld = Cld; tbeta = BETA;
550 if( CisR || ( mycol == Ccurcol ) )
551 Cbuf = Mptr( C, Cii, Ckk+Coff, Cld, size );
552 }
553 PB_Cdescset( DBUFC, M, nbb, Cimb1, nbb, Cmb, nbb, Crow, Ccurcol,
554 ctxt, Cbufld );
555/*
556* Cbuf := Cbuf + WC
557*/
558 PB_Cpaxpby( TYPE, NOCONJG, M, nbb, one, WC, 0, 0, WCd, COLUMN,
559 tbeta, Cbuf, 0, 0, DBUFC, COLUMN );
560/*
561* Unpack the nbb columns of sub( C ) and release the buffer containing them.
562*/
563 if( Cfr && ( CisR || ( mycol == Ccurcol ) ) )
564 {
565 PB_CVMpack( TYPE, &VM, ROW, COLUMN, UNPACKING, NOTRAN, nbb, Cmp,
566 BETA, Mptr( C, Cii, Ckk, Cld, size ), Cld, one, Cbuf,
567 Cbufld );
568 if( Cbuf ) free( Cbuf );
569 }
570 if( WCfr ) free( WC );
571 }
572 else
573 {
574/*
575* Replicate this panel of rows or columns of sub( B ) over sub( A ) -> WB
576*/
577 PB_CInV( TYPE, NOCONJG, COLUMN, Am, An, Ad0, nbb, Bbuf, 0, 0,
578 DBUFB, &Broc, &WB, WBd, &WBfr );
579/*
580* Allocate space for temporary results in scope of sub( A ) -> WC
581*/
582 PB_COutV( TYPE, ROW, INIT, Am, An, Ad0, nbb, &WC, WCd, &WCfr,
583 &WCsum );
584/*
585* Local matrix-matrix multiply iff I own some data
586*/
587 if( Amp > 0 && Anq > 0 )
588 gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( &GemmTb ), &nbb, &Anq,
589 &Amp, talpha, WB, &WBd[LLD_], Aptr, &Ald, zero, WC,
590 &WCd[LLD_] );
591 if( WBfr ) free( WB );
592 if( Bfr && ( BisR || ( BmyprocR == BcurrocR ) ) )
593 if( Bbuf ) free( Bbuf );
594/*
595* Accumulate the intermediate results in WC
596*/
597 if( WCsum )
598 {
599 WCd[RSRC_] = 0;
600 if( Anq > 0 )
601 gsum2d( ctxt, COLUMN, &top, nbb, Anq, WC, WCd[LLD_],
602 WCd[RSRC_], mycol );
603 }
604/*
605* Compute the descriptor DBUFC for the buffer that will contained the packed
606* columns of sub( C ). Allocate it.
607*/
608 if( ( Cfr = ( nrpq < nbb ) ) != 0 )
609 {
610/*
611* If columns of sub( C ) are not contiguous, then allocate the buffer
612*/
613 Cbufld = MAX( 1, Cmp ); tbeta = zero;
614 if( CisR || ( mycol == Ccurcol ) )
615 Cbuf = PB_Cmalloc( Cmp * nbb * size );
616 }
617 else
618 {
619/*
620* Otherwise re-use sub( C )
621*/
622 Cbufld = Cld; tbeta = BETA;
623 if( CisR || ( mycol == Ccurcol ) )
624 Cbuf = Mptr( C, Cii, Ckk+Coff, Cld, size );
625 }
626 PB_Cdescset( DBUFC, M, nbb, Cimb1, nbb, Cmb, nbb, Crow, Ccurcol,
627 ctxt, Cbufld );
628/*
629* Cbuf := Cbuf + WC'
630*/
631 PB_Cpaxpby( TYPE, ( TrA == CCOTRAN ? CONJG : NOCONJG ), nbb, M,
632 one, WC, 0, 0, WCd, ROW, tbeta, Cbuf, 0, 0, DBUFC,
633 COLUMN );
634/*
635* Unpack the nbb columns of sub( C ) and release the buffer containing them.
636*/
637 if( Cfr && ( CisR || ( mycol == Ccurcol ) ) )
638 {
639 PB_CVMpack( TYPE, &VM, ROW, COLUMN, UNPACKING, NOTRAN, nbb, Cmp,
640 BETA, Mptr( C, Cii, Ckk, Cld, size ), Cld, one, Cbuf,
641 Cbufld );
642 if( Cbuf ) free( Cbuf );
643 }
644 if( WCfr ) free( WC );
645 }
646/*
647* Update the local indexes of sub( B ) and sub( C )
648*/
649 PB_CVMupdate( &VM, nbb, &Ckk, &Bkk );
650
651 npq -= nbb;
652 }
653/*
654* Go to next or previous virtual process row or column
655*/
656 if( ( Cfwd && ( p == maxpm1 ) ) ||
657 ( !( Cfwd ) && ( p == 0 ) ) )
658 q = ( Bfwd ? MModAdd1( q, maxq ) : MModSub1( q, maxq ) );
659 p = ( Cfwd ? MModAdd1( p, maxp ) : MModSub1( p, maxp ) );
660 }
661
662 if( TrA == CCOTRAN ) free( talpha );
663/*
664* End of PB_CpgemmBC
665*/
666}
#define Int
Definition Bconfig.h:22
F_VOID_FCT(* GEMM_T)()
Definition pblas.h:317
void(* GSUM2D_T)()
Definition pblas.h:286
#define C2F_CHAR(a)
Definition pblas.h:125
#define CCOLUMN
Definition PBblacs.h:20
#define TOP_GET
Definition PBblacs.h:50
#define COLUMN
Definition PBblacs.h:45
#define COMBINE
Definition PBblacs.h:49
#define CROW
Definition PBblacs.h:21
#define ROW
Definition PBblacs.h:46
void Cblacs_gridinfo()
#define NOTRAN
Definition PBblas.h:44
#define CONJG
Definition PBblas.h:47
#define NOCONJG
Definition PBblas.h:45
#define CNOTRAN
Definition PBblas.h:18
#define CTRAN
Definition PBblas.h:20
#define CCOTRAN
Definition PBblas.h:22
#define INIT
Definition PBblas.h:61
#define CFORWARD
Definition PBblas.h:38
#define pilaenv_
Definition PBpblas.h:44
#define CTXT_
Definition PBtools.h:38
#define UNPACKING
Definition PBtools.h:54
void PB_CVMinit()
Int PB_Cfirstnb()
#define MAX(a_, b_)
Definition PBtools.h:77
#define MB_
Definition PBtools.h:43
char * PB_Cmalloc()
void PB_Cinfog2l()
#define MModSub(I1, I2, d)
Definition PBtools.h:102
#define PACKING
Definition PBtools.h:53
#define MIN(a_, b_)
Definition PBtools.h:76
#define Mptr(a_, i_, j_, lda_, siz_)
Definition PBtools.h:132
#define LLD_
Definition PBtools.h:47
Int PB_Cnumroc()
Int PB_CVMpack()
char * PB_Ctop()
void PB_CInV()
void PB_CVMupdate()
#define RSRC_
Definition PBtools.h:45
void PB_Cdescset()
void PB_COutV()
#define MModAdd1(I, d)
Definition PBtools.h:100
#define MModAdd(I1, I2, d)
Definition PBtools.h:97
#define INB_
Definition PBtools.h:42
Int PB_CVMnpq()
#define MModSub1(I, d)
Definition PBtools.h:105
#define CSRC_
Definition PBtools.h:46
Int PB_Clcm()
#define IMB_
Definition PBtools.h:41
Int PB_Cindxg2p()
Int PB_Cg2lrem()
void PB_CpgemmBC()
#define Mupcase(C)
Definition PBtools.h:83
#define DLEN_
Definition PBtools.h:48
#define NB_
Definition PBtools.h:44
void PB_Cconjg()
void PB_CVMcontig()
void PB_Cpaxpby()
void PB_Cdescribe()
#define TYPE
Definition clamov.c:7