ScaLAPACK 2.1  2.1
ScaLAPACK: Scalable Linear Algebra PACKage
PB_CpgemmBC.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS auxiliary routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "../pblas.h"
14 #include "../PBpblas.h"
15 #include "../PBtools.h"
16 #include "../PBblacs.h"
17 #include "../PBblas.h"
18 
19 #ifdef __STDC__
20 void PB_CpgemmBC( PBTYP_T * TYPE, char * DIRECB, char * DIRECC,
21  char * TRANSA, char * TRANSB, int M, int N, int K,
22  char * ALPHA, char * A, int IA, int JA, int * DESCA,
23  char * B, int IB, int JB, int * DESCB, char * BETA,
24  char * C, int IC, int JC, int * DESCC )
25 #else
26 void PB_CpgemmBC( TYPE, DIRECB, DIRECC, TRANSA, TRANSB, M, N, K, ALPHA,
27  A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC,
28  DESCC )
29 /*
30 * .. Scalar Arguments ..
31 */
32  char * DIRECB, * DIRECC, * TRANSA, * TRANSB;
33  int IA, IB, IC, JA, JB, JC, K, M, N;
34  char * ALPHA, * BETA;
35  PBTYP_T * TYPE;
36 /*
37 * .. Array Arguments ..
38 */
39  int * DESCA, * DESCB, * DESCC;
40  char * A, * B, * C;
41 #endif
42 {
43 /*
44 * Purpose
45 * =======
46 *
47 * PB_CpgemmBC performs one of the matrix-matrix operations
48 *
49 * sub( C ) := alpha*op( sub( A ) )*op( sub( B ) ) + beta*sub( C ),
50 *
51 * where
52 *
53 * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of
54 * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ).
55 *
56 * Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+K-1) if TRANSA = 'N',
57 * A(IA:IA+K-1,JA:JA+M-1)' if TRANSA = 'T',
58 * conjg(A(IA:IA+K-1,JA:JA+M-1)') if TRANSA = 'C',
59 *
60 * and, op( sub( B ) ) denotes B(IB:IB+K-1,JB:JB+N-1) if TRANSB = 'N',
61 * B(IB:IB+N-1,JB:JB+K-1)' if TRANSB = 'T',
62 * conjg(B(IB:IB+N-1,JB:JB+K-1)') if TRANSB = 'C'.
63 *
64 * Alpha and beta are scalars. A, B and C are matrices; op( sub( A ) )
65 * is an m by k submatrix, op( sub( B ) ) is an k by n submatrix and
66 * sub( C ) is an m by n submatrix.
67 *
68 * This is the inner-product algorithm using the logical LCM algorithmic
69 * blocking technique. The submatrix operand sub( A ) stays in place.
70 *
71 * Notes
72 * =====
73 *
74 * A description vector is associated with each 2D block-cyclicly dis-
75 * tributed matrix. This vector stores the information required to
76 * establish the mapping between a matrix entry and its corresponding
77 * process and memory location.
78 *
79 * In the following comments, the character _ should be read as
80 * "of the distributed matrix". Let A be a generic term for any 2D
81 * block cyclicly distributed matrix. Its description vector is DESC_A:
82 *
83 * NOTATION STORED IN EXPLANATION
84 * ---------------- --------------- ------------------------------------
85 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
86 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
87 * the NPROW x NPCOL BLACS process grid
88 * A is distributed over. The context
89 * itself is global, but the handle
90 * (the integer value) may vary.
91 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
92 * ted matrix A, M_A >= 0.
93 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
94 * buted matrix A, N_A >= 0.
95 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
96 * block of the matrix A, IMB_A > 0.
97 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
98 * left block of the matrix A,
99 * INB_A > 0.
100 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
101 * bute the last M_A-IMB_A rows of A,
102 * MB_A > 0.
103 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
104 * bute the last N_A-INB_A columns of
105 * A, NB_A > 0.
106 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
107 * row of the matrix A is distributed,
108 * NPROW > RSRC_A >= 0.
109 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
110 * first column of A is distributed.
111 * NPCOL > CSRC_A >= 0.
112 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
113 * array storing the local blocks of
114 * the distributed matrix A,
115 * IF( Lc( 1, N_A ) > 0 )
116 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
117 * ELSE
118 * LLD_A >= 1.
119 *
120 * Let K be the number of rows of a matrix A starting at the global in-
121 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
122 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
123 * receive if these K rows were distributed over NPROW processes. If K
124 * is the number of columns of a matrix A starting at the global index
125 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
126 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
127 * these K columns were distributed over NPCOL processes.
128 *
129 * The values of Lr() and Lc() may be determined via a call to the func-
130 * tion PB_Cnumroc:
131 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
132 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
133 *
134 * Arguments
135 * =========
136 *
137 * TYPE (local input) pointer to a PBTYP_T structure
138 * On entry, TYPE is a pointer to a structure of type PBTYP_T,
139 * that contains type information (See pblas.h).
140 *
141 * DIRECB (global input) pointer to CHAR
142 * On entry, DIRECB specifies the direction in which the rows
143 * or columns of sub( B ) should be looped over as follows:
144 * DIRECB = 'F' or 'f' forward or increasing,
145 * DIRECB = 'B' or 'b' backward or decreasing.
146 *
147 * DIRECC (global input) pointer to CHAR
148 * On entry, DIRECC specifies the direction in which the rows
149 * or columns of sub( C ) should be looped over as follows:
150 * DIRECC = 'F' or 'f' forward or increasing,
151 * DIRECC = 'B' or 'b' backward or decreasing.
152 *
153 * TRANSA (global input) pointer to CHAR
154 * On entry, TRANSA specifies the form of op( sub( A ) ) to be
155 * used in the matrix multiplication as follows:
156 *
157 * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ),
158 * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )',
159 * TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ).
160 *
161 * TRANSB (global input) pointer to CHAR
162 * On entry, TRANSB specifies the form of op( sub( B ) ) to be
163 * used in the matrix multiplication as follows:
164 *
165 * TRANSB = 'N' or 'n' op( sub( B ) ) = sub( B ),
166 * TRANSB = 'T' or 't' op( sub( B ) ) = sub( B )',
167 * TRANSB = 'C' or 'c' op( sub( B ) ) = conjg( sub( B )' ).
168 *
169 * M (global input) INTEGER
170 * On entry, M specifies the number of rows of the submatrix
171 * op( sub( A ) ) and of the submatrix sub( C ). M must be at
172 * least zero.
173 *
174 * N (global input) INTEGER
175 * On entry, N specifies the number of columns of the submatrix
176 * op( sub( B ) ) and the number of columns of the submatrix
177 * sub( C ). N must be at least zero.
178 *
179 * K (global input) INTEGER
180 * On entry, K specifies the number of columns of the submatrix
181 * op( sub( A ) ) and the number of rows of the submatrix
182 * op( sub( B ) ). K must be at least zero.
183 *
184 * ALPHA (global input) pointer to CHAR
185 * On entry, ALPHA specifies the scalar alpha. When ALPHA is
186 * supplied as zero then the local entries of the arrays A and
187 * B corresponding to the entries of the submatrices sub( A )
188 * and sub( B ) respectively need not be set on input.
189 *
190 * A (local input) pointer to CHAR
191 * On entry, A is an array of dimension (LLD_A, Ka), where Ka is
192 * at least Lc( 1, JA+K-1 ) when TRANSA = 'N' or 'n', and is at
193 * least Lc( 1, JA+M-1 ) otherwise. Before entry, this array
194 * contains the local entries of the matrix A.
195 *
196 * IA (global input) INTEGER
197 * On entry, IA specifies A's global row index, which points to
198 * the beginning of the submatrix sub( A ).
199 *
200 * JA (global input) INTEGER
201 * On entry, JA specifies A's global column index, which points
202 * to the beginning of the submatrix sub( A ).
203 *
204 * DESCA (global and local input) INTEGER array
205 * On entry, DESCA is an integer array of dimension DLEN_. This
206 * is the array descriptor for the matrix A.
207 *
208 * B (local input) pointer to CHAR
209 * On entry, B is an array of dimension (LLD_B, Kb), where Kb is
210 * at least Lc( 1, JB+N-1 ) when TRANSB = 'N' or 'n', and is at
211 * least Lc( 1, JB+K-1 ) otherwise. Before entry, this array
212 * contains the local entries of the matrix B.
213 *
214 * IB (global input) INTEGER
215 * On entry, IB specifies B's global row index, which points to
216 * the beginning of the submatrix sub( B ).
217 *
218 * JB (global input) INTEGER
219 * On entry, JB specifies B's global column index, which points
220 * to the beginning of the submatrix sub( B ).
221 *
222 * DESCB (global and local input) INTEGER array
223 * On entry, DESCB is an integer array of dimension DLEN_. This
224 * is the array descriptor for the matrix B.
225 *
226 * BETA (global input) pointer to CHAR
227 * On entry, BETA specifies the scalar beta. When BETA is
228 * supplied as zero then the local entries of the array C
229 * corresponding to the entries of the submatrix sub( C ) need
230 * not be set on input.
231 *
232 * C (local input/local output) pointer to CHAR
233 * On entry, C is an array of dimension (LLD_C, Kc), where Kc is
234 * at least Lc( 1, JC+N-1 ). Before entry, this array contains
235 * the local entries of the matrix C.
236 * On exit, the entries of this array corresponding to the local
237 * entries of the submatrix sub( C ) are overwritten by the
238 * local entries of the m by n updated submatrix.
239 *
240 * IC (global input) INTEGER
241 * On entry, IC specifies C's global row index, which points to
242 * the beginning of the submatrix sub( C ).
243 *
244 * JC (global input) INTEGER
245 * On entry, JC specifies C's global column index, which points
246 * to the beginning of the submatrix sub( C ).
247 *
248 * DESCC (global and local input) INTEGER array
249 * On entry, DESCC is an integer array of dimension DLEN_. This
250 * is the array descriptor for the matrix C.
251 *
252 * -- Written on April 1, 1998 by
253 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
254 *
255 * ---------------------------------------------------------------------
256 */
257 /*
258 * .. Local Scalars ..
259 */
260  char Broc, GemmTa, GemmTb, TrA, TrB, * one, * talpha, * tbeta,
261  top, * zero;
262  int Acol, Aii, Aimb1, Ainb1, Ajj, Ald, Am, Amb, Amp, An, Anb,
263  Anq, Arow, Bbufld, BcurrocR, Bfr, Bfwd, BiD, BiR, BiiD, BiiR,
264  BinbD, BinbR, Binb1D, Binb1R, BisR, Bkk, Bld, BmyprocD,
265  BmyprocR, BnbD, BnbR, BnpD, BnpR, BnprocsD, BnprocsR, Boff,
266  BrocD, BrocR, BsrcR, Bsrc_, Cbufld, Ccol, Ccurcol, Cfr, Cfwd,
267  Cii, Cimb, Cimb1, Cinb, Cinb1, CisR, Cjj, Ckk, Cld, Cmb, Cmp,
268  Cnb, Cnq, Coff, Crow, Csrc, WBfr, WCfr, WCsum, ctxt, lcmb,
269  maxp, maxpm1, maxq, mycol, myrow, n, nb, nbb, ncpq, nota,
270  notb, npcol, npq=0, nprow, nrpq, p=0, q=0, size, tmp;
271  GEMM_T gemm;
272  GSUM2D_T gsum2d;
273 /*
274 * .. Local Arrays ..
275 */
276  int Ad0[DLEN_], DBUFB[DLEN_], DBUFC[DLEN_], WBd[DLEN_],
277  WCd[DLEN_];
278  PB_VM_T VM;
279  char * Aptr = NULL, * Bbuf = NULL, * Cbuf = NULL, * WB = NULL,
280  * WC = NULL;
281 /* ..
282 * .. Executable Statements ..
283 *
284 */
285  Cblacs_gridinfo( ( ctxt = DESCC[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
286 
287  Bfwd = ( Mupcase( DIRECB[0] ) == CFORWARD );
288  Cfwd = ( Mupcase( DIRECC[0] ) == CFORWARD );
289  nota = ( ( TrA = Mupcase( TRANSA[0] ) ) == CNOTRAN );
290  notb = ( ( TrB = Mupcase( TRANSB[0] ) ) == CNOTRAN );
291 
292  size = TYPE->size; one = TYPE->one; zero = TYPE->zero;
293  gemm = TYPE->Fgemm; gsum2d = TYPE->Cgsum2d;
294  nb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
295 /*
296 * Compute local information for sub( A ), sub( B ) and sub( C )
297 */
298  if( notb )
299  {
300  BiD = IB; BiR = JB;
301  Bsrc_ = CSRC_; Broc = CCOLUMN;
302  BinbD = DESCB[IMB_ ]; BinbR = DESCB[INB_];
303  BnbD = DESCB[MB_ ]; BnbR = DESCB[NB_ ];
304  BsrcR = DESCB[Bsrc_]; Bld = DESCB[LLD_];
305  BmyprocD = myrow; BnprocsD = nprow;
306  BmyprocR = mycol; BnprocsR = npcol;
307  PB_Cinfog2l( IB, JB, DESCB, BnprocsD, BnprocsR, BmyprocD, BmyprocR,
308  &BiiD, &BiiR, &BrocD, &BrocR );
309  }
310  else
311  {
312  BiD = JB; BiR = IB;
313  Bsrc_ = RSRC_; Broc = CROW;
314  BinbR = DESCB[IMB_ ]; BinbD = DESCB[INB_];
315  BnbR = DESCB[MB_ ]; BnbD = DESCB[NB_ ];
316  BsrcR = DESCB[Bsrc_]; Bld = DESCB[LLD_];
317  BmyprocD = mycol; BnprocsD = npcol;
318  BmyprocR = myrow; BnprocsR = nprow;
319  PB_Cinfog2l( IB, JB, DESCB, BnprocsR, BnprocsD, BmyprocR, BmyprocD,
320  &BiiR, &BiiD, &BrocR, &BrocD );
321  }
322  Binb1D = PB_Cfirstnb( K, BiD, BinbD, BnbD );
323  BnpD = PB_Cnumroc( K, 0, Binb1D, BnbD, BmyprocD, BrocD, BnprocsD );
324  Binb1R = PB_Cfirstnb( N, BiR, BinbR, BnbR );
325 
326  Cimb = DESCC[IMB_ ]; Cinb = DESCC[INB_];
327  Cmb = DESCC[MB_ ]; Cnb = DESCC[NB_ ];
328  Csrc = DESCC[CSRC_]; Cld = DESCC[LLD_];
329  PB_Cinfog2l( IC, JC, DESCC, nprow, npcol, myrow, mycol, &Cii, &Cjj,
330  &Crow, &Ccol );
331  Cimb1 = PB_Cfirstnb( M, IC, Cimb, Cmb );
332  Cmp = PB_Cnumroc( M, 0, Cimb1, Cmb, myrow, Crow, nprow );
333  Cinb1 = PB_Cfirstnb( N, JC, Cinb, Cnb );
334 /*
335 * Retrieve the BLACS combine topology, compute conjugate of alpha for the
336 * conjugate transpose case and set the transpose parameters to be passed to
337 * the BLAS matrix multiply routine.
338 */
339  if( nota )
340  {
341  Am = M; An = K;
342  top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
343  talpha = ALPHA; GemmTa = CNOTRAN; GemmTb = ( notb ? CTRAN : TrB );
344  }
345  else
346  {
347  Am = K; An = M;
348  top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
349  if( TrA == CCOTRAN )
350  {
351  talpha = PB_Cmalloc( size ); PB_Cconjg( TYPE, ALPHA, talpha );
352  GemmTa = ( ( TrB == CCOTRAN ) ? CTRAN : CCOTRAN );
353  }
354  else
355  {
356  talpha = ALPHA;
357  GemmTa = ( ( TrB == CCOTRAN ) ? CCOTRAN : CTRAN );
358  }
359  GemmTb = CNOTRAN;
360  }
361 /*
362 * Compute descriptor Ad0 for sub( A )
363 */
364  PB_Cdescribe( Am, An, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj,
365  &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 );
366 
367  Amp = PB_Cnumroc( Am, 0, Aimb1, Amb, myrow, Arow, nprow );
368  Anq = PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol );
369  if( ( Amp > 0 ) && ( Anq > 0 ) ) { Aptr = Mptr( A, Aii, Ajj, Ald, size ); }
370 /*
371 * When sub( B ) is not replicated and backward pass on sub( B ), find the
372 * virtual process q owning the last row or column of sub( B ).
373 */
374  if( !( BisR = ( ( BsrcR < 0 ) || ( BnprocsR == 1 ) ) ) && !Bfwd )
375  {
376  tmp = PB_Cindxg2p( N - 1, Binb1R, BnbR, BrocR, BrocR, BnprocsR );
377  q = MModSub( tmp, BrocR, BnprocsR );
378  }
379 /*
380 * When sub( C ) is not replicated and backward pass on sub( C ), find the
381 * virtual process p owning the last row or column of sub( C ).
382 */
383  if( !( CisR = ( ( Ccol < 0 ) || ( npcol == 1 ) ) ) && !Cfwd )
384  {
385  tmp = PB_Cindxg2p( N - 1, Cinb1, Cnb, Ccol, Ccol, npcol );
386  p = MModSub( tmp, Ccol, npcol );
387  }
388 /*
389 * Loop over the virtual process grid induced by the rows or columns of
390 * sub( B ) and sub( C ).
391 */
392  lcmb = PB_Clcm( ( maxp = ( CisR ? 1 : npcol ) ) * Cnb,
393  ( maxq = ( BisR ? 1 : BnprocsR ) ) * BnbR );
394  n = N;
395  maxpm1 = maxp - 1;
396 
397  while( n > 0 )
398  {
399 /*
400 * Initialize local virtual matrix in process (p,q)
401 */
402  BcurrocR = ( BisR ? -1 : MModAdd( BrocR, q, BnprocsR ) );
403  Bkk = PB_Cg2lrem( BiR, BinbR, BnbR, BcurrocR, BsrcR, BnprocsR );
404  BnpR = PB_Cnumroc( N, 0, Binb1R, BnbR, BcurrocR, BrocR, BnprocsR );
405 
406  Ccurcol = ( CisR ? -1 : MModAdd( Ccol, p, npcol ) );
407  Ckk = PB_Cg2lrem( JC, Cinb, Cnb, Ccurcol, Csrc, npcol );
408  Cnq = PB_Cnumroc( N, 0, Cinb1, Cnb, Ccurcol, Ccol, npcol );
409 
410  PB_CVMinit( &VM, 0, Cnq, BnpR, Cinb1, Binb1R, Cnb, BnbR, p, q,
411  maxp, maxq, lcmb );
412 /*
413 * Find how many diagonals in this virtual process
414 */
415  npq = PB_CVMnpq( &VM );
416 
417  n -= npq;
418 /*
419 * Re-adjust the number of rows or columns to be (un)packed, in order to
420 * average the message sizes.
421 */
422  if( npq ) nbb = npq / ( ( npq - 1 ) / nb + 1 );
423 
424  while( npq )
425  {
426  nbb = MIN( nbb, npq );
427 /*
428 * Find out how many rows or columns of sub( B ) and sub( C ) are contiguous
429 */
430  PB_CVMcontig( &VM, &nrpq, &ncpq, &Coff, &Boff );
431 
432  if( notb )
433  {
434 /*
435 * Compute the descriptor DBUFB for the buffer that will contained the packed
436 * columns of sub( B ).
437 */
438  if( ( Bfr = ( ncpq < nbb ) ) != 0 )
439  {
440 /*
441 * If columns of sub( B ) are not contiguous, then allocate the buffer and
442 * pack the nbb columns of sub( B ).
443 */
444  Bbufld = MAX( 1, BnpD );
445  if( BisR || ( BmyprocR == BcurrocR ) )
446  {
447  Bbuf = PB_Cmalloc( BnpD * nbb * size );
448  PB_CVMpack( TYPE, &VM, COLUMN, &Broc, PACKING, NOTRAN, nbb,
449  BnpD, one, Mptr( B, BiiD, Bkk, Bld, size ), Bld,
450  zero, Bbuf, Bbufld );
451  }
452  }
453  else
454  {
455 /*
456 * Otherwise, re-use sub( B ) directly.
457 */
458  Bbufld = Bld;
459  if( BisR || ( BmyprocR == BcurrocR ) )
460  Bbuf = Mptr( B, BiiD, Bkk+Boff, Bld, size );
461  }
462  PB_Cdescset( DBUFB, K, nbb, Binb1D, nbb, BnbD, nbb, BrocD,
463  BcurrocR, ctxt, Bbufld );
464  }
465  else
466  {
467 /*
468 * Compute the descriptor DBUFB for the buffer that will contained the packed
469 * rows of sub( B ).
470 */
471  if( ( Bfr = ( ncpq < nbb ) ) != 0 )
472  {
473 /*
474 * If rows of sub( B ) are not contiguous, then allocate the buffer and pack
475 * the nbb rows of sub( B ).
476 */
477  Bbufld = nbb;
478  if( BisR || ( BmyprocR == BcurrocR ) )
479  {
480  Bbuf = PB_Cmalloc( BnpD * nbb * size );
481  PB_CVMpack( TYPE, &VM, COLUMN, &Broc, PACKING, NOTRAN, nbb,
482  BnpD, one, Mptr( B, Bkk, BiiD, Bld, size ), Bld,
483  zero, Bbuf, Bbufld );
484  }
485  }
486  else
487  {
488 /*
489 * Otherwise, re-use sub( B ) directly.
490 */
491  Bbufld = Bld;
492  if( BisR || ( BmyprocR == BcurrocR ) )
493  Bbuf = Mptr( B, Bkk+Boff, BiiD, Bld, size );
494  }
495  PB_Cdescset( DBUFB, nbb, K, nbb, Binb1D, nbb, BnbD, BcurrocR,
496  BrocD, ctxt, Bbufld );
497  }
498 
499  if( nota )
500  {
501 /*
502 * Replicate this panel of rows or columns of sub( B ) over sub( A ) -> WB
503 */
504  PB_CInV( TYPE, NOCONJG, ROW, Am, An, Ad0, nbb, Bbuf, 0, 0,
505  DBUFB, &Broc, &WB, WBd, &WBfr );
506 /*
507 * Allocate space for temporary results in scope of sub( A ) -> WC
508 */
509  PB_COutV( TYPE, COLUMN, INIT, Am, An, Ad0, nbb, &WC, WCd, &WCfr,
510  &WCsum );
511 /*
512 * Local matrix-matrix multiply iff I own some data
513 */
514  if( Amp > 0 && Anq > 0 )
515  gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( &GemmTb ), &Amp, &nbb,
516  &Anq, talpha, Aptr, &Ald, WB, &WBd[LLD_], zero,
517  WC, &WCd[LLD_] );
518  if( WBfr ) free( WB );
519  if( Bfr && ( BisR || ( BmyprocR == BcurrocR ) ) )
520  if( Bbuf ) free( Bbuf );
521 /*
522 * Accumulate the intermediate results in WC
523 */
524  if( WCsum )
525  {
526  WCd[CSRC_] = Ccurcol;
527  if( Amp > 0 )
528  gsum2d( ctxt, ROW, &top, Amp, nbb, WC, WCd[LLD_], myrow,
529  WCd[CSRC_] );
530  }
531 /*
532 * Compute the descriptor DBUFC for the buffer that will contained the packed
533 * columns of sub( C ). Allocate it.
534 */
535  if( ( Cfr = ( nrpq < nbb ) ) != 0 )
536  {
537 /*
538 * If columns of sub( C ) are not contiguous, then allocate the buffer
539 */
540  Cbufld = MAX( 1, Cmp ); tbeta = zero;
541  if( CisR || ( mycol == Ccurcol ) )
542  Cbuf = PB_Cmalloc( Cmp * nbb * size );
543  }
544  else
545  {
546 /*
547 * Otherwise re-use sub( C )
548 */
549  Cbufld = Cld; tbeta = BETA;
550  if( CisR || ( mycol == Ccurcol ) )
551  Cbuf = Mptr( C, Cii, Ckk+Coff, Cld, size );
552  }
553  PB_Cdescset( DBUFC, M, nbb, Cimb1, nbb, Cmb, nbb, Crow, Ccurcol,
554  ctxt, Cbufld );
555 /*
556 * Cbuf := Cbuf + WC
557 */
558  PB_Cpaxpby( TYPE, NOCONJG, M, nbb, one, WC, 0, 0, WCd, COLUMN,
559  tbeta, Cbuf, 0, 0, DBUFC, COLUMN );
560 /*
561 * Unpack the nbb columns of sub( C ) and release the buffer containing them.
562 */
563  if( Cfr && ( CisR || ( mycol == Ccurcol ) ) )
564  {
565  PB_CVMpack( TYPE, &VM, ROW, COLUMN, UNPACKING, NOTRAN, nbb, Cmp,
566  BETA, Mptr( C, Cii, Ckk, Cld, size ), Cld, one, Cbuf,
567  Cbufld );
568  if( Cbuf ) free( Cbuf );
569  }
570  if( WCfr ) free( WC );
571  }
572  else
573  {
574 /*
575 * Replicate this panel of rows or columns of sub( B ) over sub( A ) -> WB
576 */
577  PB_CInV( TYPE, NOCONJG, COLUMN, Am, An, Ad0, nbb, Bbuf, 0, 0,
578  DBUFB, &Broc, &WB, WBd, &WBfr );
579 /*
580 * Allocate space for temporary results in scope of sub( A ) -> WC
581 */
582  PB_COutV( TYPE, ROW, INIT, Am, An, Ad0, nbb, &WC, WCd, &WCfr,
583  &WCsum );
584 /*
585 * Local matrix-matrix multiply iff I own some data
586 */
587  if( Amp > 0 && Anq > 0 )
588  gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( &GemmTb ), &nbb, &Anq,
589  &Amp, talpha, WB, &WBd[LLD_], Aptr, &Ald, zero, WC,
590  &WCd[LLD_] );
591  if( WBfr ) free( WB );
592  if( Bfr && ( BisR || ( BmyprocR == BcurrocR ) ) )
593  if( Bbuf ) free( Bbuf );
594 /*
595 * Accumulate the intermediate results in WC
596 */
597  if( WCsum )
598  {
599  WCd[RSRC_] = 0;
600  if( Anq > 0 )
601  gsum2d( ctxt, COLUMN, &top, nbb, Anq, WC, WCd[LLD_],
602  WCd[RSRC_], mycol );
603  }
604 /*
605 * Compute the descriptor DBUFC for the buffer that will contained the packed
606 * columns of sub( C ). Allocate it.
607 */
608  if( ( Cfr = ( nrpq < nbb ) ) != 0 )
609  {
610 /*
611 * If columns of sub( C ) are not contiguous, then allocate the buffer
612 */
613  Cbufld = MAX( 1, Cmp ); tbeta = zero;
614  if( CisR || ( mycol == Ccurcol ) )
615  Cbuf = PB_Cmalloc( Cmp * nbb * size );
616  }
617  else
618  {
619 /*
620 * Otherwise re-use sub( C )
621 */
622  Cbufld = Cld; tbeta = BETA;
623  if( CisR || ( mycol == Ccurcol ) )
624  Cbuf = Mptr( C, Cii, Ckk+Coff, Cld, size );
625  }
626  PB_Cdescset( DBUFC, M, nbb, Cimb1, nbb, Cmb, nbb, Crow, Ccurcol,
627  ctxt, Cbufld );
628 /*
629 * Cbuf := Cbuf + WC'
630 */
631  PB_Cpaxpby( TYPE, ( TrA == CCOTRAN ? CONJG : NOCONJG ), nbb, M,
632  one, WC, 0, 0, WCd, ROW, tbeta, Cbuf, 0, 0, DBUFC,
633  COLUMN );
634 /*
635 * Unpack the nbb columns of sub( C ) and release the buffer containing them.
636 */
637  if( Cfr && ( CisR || ( mycol == Ccurcol ) ) )
638  {
639  PB_CVMpack( TYPE, &VM, ROW, COLUMN, UNPACKING, NOTRAN, nbb, Cmp,
640  BETA, Mptr( C, Cii, Ckk, Cld, size ), Cld, one, Cbuf,
641  Cbufld );
642  if( Cbuf ) free( Cbuf );
643  }
644  if( WCfr ) free( WC );
645  }
646 /*
647 * Update the local indexes of sub( B ) and sub( C )
648 */
649  PB_CVMupdate( &VM, nbb, &Ckk, &Bkk );
650 
651  npq -= nbb;
652  }
653 /*
654 * Go to next or previous virtual process row or column
655 */
656  if( ( Cfwd && ( p == maxpm1 ) ) ||
657  ( !( Cfwd ) && ( p == 0 ) ) )
658  q = ( Bfwd ? MModAdd1( q, maxq ) : MModSub1( q, maxq ) );
659  p = ( Cfwd ? MModAdd1( p, maxp ) : MModSub1( p, maxp ) );
660  }
661 
662  if( TrA == CCOTRAN ) free( talpha );
663 /*
664 * End of PB_CpgemmBC
665 */
666 }
TYPE
#define TYPE
Definition: clamov.c:7
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
PB_Cpaxpby
void PB_Cpaxpby()
PB_CVMcontig
void PB_CVMcontig()
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
CSRC_
#define CSRC_
Definition: PBtools.h:46
UNPACKING
#define UNPACKING
Definition: PBtools.h:54
PB_Cg2lrem
int PB_Cg2lrem()
PB_Cconjg
void PB_Cconjg()
CCOTRAN
#define CCOTRAN
Definition: PBblas.h:22
NOCONJG
#define NOCONJG
Definition: PBblas.h:45
PB_Cfirstnb
int PB_Cfirstnb()
DLEN_
#define DLEN_
Definition: PBtools.h:48
GEMM_T
F_VOID_FCT(* GEMM_T)()
Definition: pblas.h:313
NOTRAN
#define NOTRAN
Definition: PBblas.h:44
LLD_
#define LLD_
Definition: PBtools.h:47
PB_Cdescribe
void PB_Cdescribe()
PB_CpgemmBC
void PB_CpgemmBC(PBTYP_T *TYPE, char *DIRECB, char *DIRECC, char *TRANSA, char *TRANSB, int M, int N, int K, char *ALPHA, char *A, int IA, int JA, int *DESCA, char *B, int IB, int JB, int *DESCB, char *BETA, char *C, int IC, int JC, int *DESCC)
Definition: PB_CpgemmBC.c:26
PB_CVMinit
void PB_CVMinit()
PB_CVMpack
int PB_CVMpack()
MModAdd
#define MModAdd(I1, I2, d)
Definition: PBtools.h:97
CROW
#define CROW
Definition: PBblacs.h:21
IMB_
#define IMB_
Definition: PBtools.h:41
MModSub
#define MModSub(I1, I2, d)
Definition: PBtools.h:102
pilaenv_
int pilaenv_()
INIT
#define INIT
Definition: PBblas.h:61
PB_Cdescset
void PB_Cdescset()
PB_CVMupdate
void PB_CVMupdate()
MModAdd1
#define MModAdd1(I, d)
Definition: PBtools.h:100
PB_Cindxg2p
int PB_Cindxg2p()
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
GSUM2D_T
void(* GSUM2D_T)()
Definition: pblas.h:282
PB_Ctop
char * PB_Ctop()
RSRC_
#define RSRC_
Definition: PBtools.h:45
CNOTRAN
#define CNOTRAN
Definition: PBblas.h:18
COMBINE
#define COMBINE
Definition: PBblacs.h:49
CONJG
#define CONJG
Definition: PBblas.h:47
PB_Cinfog2l
void PB_Cinfog2l()
PB_Cnumroc
int PB_Cnumroc()
PB_Cmalloc
char * PB_Cmalloc()
CFORWARD
#define CFORWARD
Definition: PBblas.h:38
PB_CInV
void PB_CInV()
PB_VM_T
Definition: pblas.h:432
PB_CVMnpq
int PB_CVMnpq()
MIN
#define MIN(a_, b_)
Definition: PBtools.h:76
CCOLUMN
#define CCOLUMN
Definition: PBblacs.h:20
PACKING
#define PACKING
Definition: PBtools.h:53
INB_
#define INB_
Definition: PBtools.h:42
PB_COutV
void PB_COutV()
C2F_CHAR
#define C2F_CHAR(a)
Definition: pblas.h:121
MModSub1
#define MModSub1(I, d)
Definition: PBtools.h:105
MAX
#define MAX(a_, b_)
Definition: PBtools.h:77
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
Mupcase
#define Mupcase(C)
Definition: PBtools.h:83
CTRAN
#define CTRAN
Definition: PBblas.h:20
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
CTXT_
#define CTXT_
Definition: PBtools.h:38
PB_Clcm
int PB_Clcm()