ScaLAPACK 2.1  2.1
ScaLAPACK: Scalable Linear Algebra PACKage
PB_CpgemmAC.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS auxiliary routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "../pblas.h"
14 #include "../PBpblas.h"
15 #include "../PBtools.h"
16 #include "../PBblacs.h"
17 #include "../PBblas.h"
18 
19 #ifdef __STDC__
20 void PB_CpgemmAC( PBTYP_T * TYPE, char * DIRECA, char * DIRECC,
21  char * TRANSA, char * TRANSB, int M, int N, int K,
22  char * ALPHA, char * A, int IA, int JA, int * DESCA,
23  char * B, int IB, int JB, int * DESCB, char * BETA,
24  char * C, int IC, int JC, int * DESCC )
25 #else
26 void PB_CpgemmAC( TYPE, DIRECA, DIRECC, TRANSA, TRANSB, M, N, K, ALPHA,
27  A, IA, JA, DESCA, B, IB, JB, DESCB, BETA, C, IC, JC,
28  DESCC )
29 /*
30 * .. Scalar Arguments ..
31 */
32  char * DIRECA, * DIRECC, * TRANSA, * TRANSB;
33  int IA, IB, IC, JA, JB, JC, K, M, N;
34  char * ALPHA, * BETA;
35  PBTYP_T * TYPE;
36 /*
37 * .. Array Arguments ..
38 */
39  int * DESCA, * DESCB, * DESCC;
40  char * A, * B, * C;
41 #endif
42 {
43 /*
44 * Purpose
45 * =======
46 *
47 * PB_CpgemmAC performs one of the matrix-matrix operations
48 *
49 * sub( C ) := alpha*op( sub( A ) )*op( sub( B ) ) + beta*sub( C ),
50 *
51 * where
52 *
53 * sub( C ) denotes C(IC:IC+M-1,JC:JC+N-1), and, op( X ) is one of
54 * op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ).
55 *
56 * Thus, op( sub( A ) ) denotes A(IA:IA+M-1,JA:JA+K-1) if TRANSA = 'N',
57 * A(IA:IA+K-1,JA:JA+M-1)' if TRANSA = 'T',
58 * conjg(A(IA:IA+K-1,JA:JA+M-1)') if TRANSA = 'C',
59 *
60 * and, op( sub( B ) ) denotes B(IB:IB+K-1,JB:JB+N-1) if TRANSB = 'N',
61 * B(IB:IB+N-1,JB:JB+K-1)' if TRANSB = 'T',
62 * conjg(B(IB:IB+N-1,JB:JB+K-1)') if TRANSB = 'C'.
63 *
64 * Alpha and beta are scalars. A, B and C are matrices; op( sub( A ) )
65 * is an m by k submatrix, op( sub( B ) ) is an k by n submatrix and
66 * sub( C ) is an m by n submatrix.
67 *
68 * This is the inner-product algorithm using the logical LCM algorithmic
69 * blocking technique. The submatrix operand sub( B ) stays in place.
70 *
71 * Notes
72 * =====
73 *
74 * A description vector is associated with each 2D block-cyclicly dis-
75 * tributed matrix. This vector stores the information required to
76 * establish the mapping between a matrix entry and its corresponding
77 * process and memory location.
78 *
79 * In the following comments, the character _ should be read as
80 * "of the distributed matrix". Let A be a generic term for any 2D
81 * block cyclicly distributed matrix. Its description vector is DESC_A:
82 *
83 * NOTATION STORED IN EXPLANATION
84 * ---------------- --------------- ------------------------------------
85 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
86 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
87 * the NPROW x NPCOL BLACS process grid
88 * A is distributed over. The context
89 * itself is global, but the handle
90 * (the integer value) may vary.
91 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
92 * ted matrix A, M_A >= 0.
93 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
94 * buted matrix A, N_A >= 0.
95 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
96 * block of the matrix A, IMB_A > 0.
97 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
98 * left block of the matrix A,
99 * INB_A > 0.
100 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
101 * bute the last M_A-IMB_A rows of A,
102 * MB_A > 0.
103 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
104 * bute the last N_A-INB_A columns of
105 * A, NB_A > 0.
106 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
107 * row of the matrix A is distributed,
108 * NPROW > RSRC_A >= 0.
109 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
110 * first column of A is distributed.
111 * NPCOL > CSRC_A >= 0.
112 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
113 * array storing the local blocks of
114 * the distributed matrix A,
115 * IF( Lc( 1, N_A ) > 0 )
116 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
117 * ELSE
118 * LLD_A >= 1.
119 *
120 * Let K be the number of rows of a matrix A starting at the global in-
121 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
122 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
123 * receive if these K rows were distributed over NPROW processes. If K
124 * is the number of columns of a matrix A starting at the global index
125 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
126 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
127 * these K columns were distributed over NPCOL processes.
128 *
129 * The values of Lr() and Lc() may be determined via a call to the func-
130 * tion PB_Cnumroc:
131 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
132 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
133 *
134 * Arguments
135 * =========
136 *
137 * TYPE (local input) pointer to a PBTYP_T structure
138 * On entry, TYPE is a pointer to a structure of type PBTYP_T,
139 * that contains type information (See pblas.h).
140 *
141 * DIRECA (global input) pointer to CHAR
142 * On entry, DIRECA specifies the direction in which the rows
143 * or columns of sub( A ) should be looped over as follows:
144 * DIRECA = 'F' or 'f' forward or increasing,
145 * DIRECA = 'B' or 'b' backward or decreasing.
146 *
147 * DIRECC (global input) pointer to CHAR
148 * On entry, DIRECC specifies the direction in which the rows
149 * or columns of sub( C ) should be looped over as follows:
150 * DIRECC = 'F' or 'f' forward or increasing,
151 * DIRECC = 'B' or 'b' backward or decreasing.
152 *
153 * TRANSA (global input) pointer to CHAR
154 * On entry, TRANSA specifies the form of op( sub( A ) ) to be
155 * used in the matrix multiplication as follows:
156 *
157 * TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ),
158 * TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )',
159 * TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ).
160 *
161 * TRANSB (global input) pointer to CHAR
162 * On entry, TRANSB specifies the form of op( sub( B ) ) to be
163 * used in the matrix multiplication as follows:
164 *
165 * TRANSB = 'N' or 'n' op( sub( B ) ) = sub( B ),
166 * TRANSB = 'T' or 't' op( sub( B ) ) = sub( B )',
167 * TRANSB = 'C' or 'c' op( sub( B ) ) = conjg( sub( B )' ).
168 *
169 * M (global input) INTEGER
170 * On entry, M specifies the number of rows of the submatrix
171 * op( sub( A ) ) and of the submatrix sub( C ). M must be at
172 * least zero.
173 *
174 * N (global input) INTEGER
175 * On entry, N specifies the number of columns of the submatrix
176 * op( sub( B ) ) and the number of columns of the submatrix
177 * sub( C ). N must be at least zero.
178 *
179 * K (global input) INTEGER
180 * On entry, K specifies the number of columns of the submatrix
181 * op( sub( A ) ) and the number of rows of the submatrix
182 * op( sub( B ) ). K must be at least zero.
183 *
184 * ALPHA (global input) pointer to CHAR
185 * On entry, ALPHA specifies the scalar alpha. When ALPHA is
186 * supplied as zero then the local entries of the arrays A and
187 * B corresponding to the entries of the submatrices sub( A )
188 * and sub( B ) respectively need not be set on input.
189 *
190 * A (local input) pointer to CHAR
191 * On entry, A is an array of dimension (LLD_A, Ka), where Ka is
192 * at least Lc( 1, JA+K-1 ) when TRANSA = 'N' or 'n', and is at
193 * least Lc( 1, JA+M-1 ) otherwise. Before entry, this array
194 * contains the local entries of the matrix A.
195 *
196 * IA (global input) INTEGER
197 * On entry, IA specifies A's global row index, which points to
198 * the beginning of the submatrix sub( A ).
199 *
200 * JA (global input) INTEGER
201 * On entry, JA specifies A's global column index, which points
202 * to the beginning of the submatrix sub( A ).
203 *
204 * DESCA (global and local input) INTEGER array
205 * On entry, DESCA is an integer array of dimension DLEN_. This
206 * is the array descriptor for the matrix A.
207 *
208 * B (local input) pointer to CHAR
209 * On entry, B is an array of dimension (LLD_B, Kb), where Kb is
210 * at least Lc( 1, JB+N-1 ) when TRANSB = 'N' or 'n', and is at
211 * least Lc( 1, JB+K-1 ) otherwise. Before entry, this array
212 * contains the local entries of the matrix B.
213 *
214 * IB (global input) INTEGER
215 * On entry, IB specifies B's global row index, which points to
216 * the beginning of the submatrix sub( B ).
217 *
218 * JB (global input) INTEGER
219 * On entry, JB specifies B's global column index, which points
220 * to the beginning of the submatrix sub( B ).
221 *
222 * DESCB (global and local input) INTEGER array
223 * On entry, DESCB is an integer array of dimension DLEN_. This
224 * is the array descriptor for the matrix B.
225 *
226 * BETA (global input) pointer to CHAR
227 * On entry, BETA specifies the scalar beta. When BETA is
228 * supplied as zero then the local entries of the array C
229 * corresponding to the entries of the submatrix sub( C ) need
230 * not be set on input.
231 *
232 * C (local input/local output) pointer to CHAR
233 * On entry, C is an array of dimension (LLD_C, Kc), where Kc is
234 * at least Lc( 1, JC+N-1 ). Before entry, this array contains
235 * the local entries of the matrix C.
236 * On exit, the entries of this array corresponding to the local
237 * entries of the submatrix sub( C ) are overwritten by the
238 * local entries of the m by n updated submatrix.
239 *
240 * IC (global input) INTEGER
241 * On entry, IC specifies C's global row index, which points to
242 * the beginning of the submatrix sub( C ).
243 *
244 * JC (global input) INTEGER
245 * On entry, JC specifies C's global column index, which points
246 * to the beginning of the submatrix sub( C ).
247 *
248 * DESCC (global and local input) INTEGER array
249 * On entry, DESCC is an integer array of dimension DLEN_. This
250 * is the array descriptor for the matrix C.
251 *
252 * -- Written on April 1, 1998 by
253 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
254 *
255 * ---------------------------------------------------------------------
256 */
257 /*
258 * .. Local Scalars ..
259 */
260  char Aroc, GemmTa, GemmTb, TrA, TrB, * one, * talpha, * tbeta,
261  top, * zero;
262  int Abufld, AcurrocR, Afr, Afwd, AiD, AiR, AiiD, AiiR, AinbD,
263  AinbR, Ainb1D, Ainb1R, AisR, Akk, Ald, AmyprocD, AmyprocR,
264  AnbD, AnbR, AnpD, AnpR, AnprocsD, AnprocsR, Aoff, ArocD,
265  ArocR, AsrcR, Asrc_, Bcol, Bii, Bimb1, Binb1, Bjj, Bld, Bm,
266  Bmb, Bmp, Bn, Bnb, Bnq, Brow, Cbufld, Ccol, Ccurrow, Cfr,
267  Cfwd, Cii, Cimb, Cimb1, Cinb, Cinb1, CisR, Cjj, Ckk, Cld,
268  Cmb, Cmp, Cnb, Cnq, Coff, Crow, Csrc, WAfr, WCfr, WCsum,
269  ctxt, lcmb, m, maxp, maxpm1, maxq, mb, mbb, mycol, myrow,
270  ncpq, nota, notb, npcol, npq=0, nprow, nrpq, p=0, q=0, size,
271  tmp;
272  GEMM_T gemm;
273  GSUM2D_T gsum2d;
274 /*
275 * .. Local Arrays ..
276 */
277  PB_VM_T VM;
278  int Bd0[DLEN_], DBUFA[DLEN_], DBUFC[DLEN_], WAd[DLEN_],
279  WCd[DLEN_];
280  char * Abuf = NULL, * Bptr = NULL, * Cbuf = NULL, * WA = NULL,
281  * WC = NULL;
282 /* ..
283 * .. Executable Statements ..
284 *
285 */
286  Cblacs_gridinfo( ( ctxt = DESCC[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
287 
288  Afwd = ( Mupcase( DIRECA[0] ) == CFORWARD );
289  Cfwd = ( Mupcase( DIRECC[0] ) == CFORWARD );
290  nota = ( ( TrA = Mupcase( TRANSA[0] ) ) == CNOTRAN );
291  notb = ( ( TrB = Mupcase( TRANSB[0] ) ) == CNOTRAN );
292 
293  size = TYPE->size; one = TYPE->one; zero = TYPE->zero;
294  gemm = TYPE->Fgemm; gsum2d = TYPE->Cgsum2d;
295  mb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
296 /*
297 * Compute local information for sub( A ), sub( B ) and sub( C )
298 */
299  if( nota )
300  {
301  AiD = JA; AiR = IA;
302  Asrc_ = RSRC_; Aroc = CROW;
303  AinbR = DESCA[IMB_ ]; AinbD = DESCA[INB_];
304  AnbR = DESCA[MB_ ]; AnbD = DESCA[NB_ ];
305  AsrcR = DESCA[Asrc_]; Ald = DESCA[LLD_];
306  AmyprocD = mycol; AnprocsD = npcol;
307  AmyprocR = myrow; AnprocsR = nprow;
308  PB_Cinfog2l( IA, JA, DESCA, AnprocsR, AnprocsD, AmyprocR, AmyprocD,
309  &AiiR, &AiiD, &ArocR, &ArocD );
310  }
311  else
312  {
313  AiD = IA; AiR = JA;
314  Asrc_ = CSRC_; Aroc = CCOLUMN;
315  AinbD = DESCA[IMB_ ]; AinbR = DESCA[INB_];
316  AnbD = DESCA[MB_ ]; AnbR = DESCA[NB_ ];
317  AsrcR = DESCA[Asrc_]; Ald = DESCA[LLD_];
318  AmyprocD = myrow; AnprocsD = nprow;
319  AmyprocR = mycol; AnprocsR = npcol;
320  PB_Cinfog2l( IA, JA, DESCA, AnprocsD, AnprocsR, AmyprocD, AmyprocR,
321  &AiiD, &AiiR, &ArocD, &ArocR );
322  }
323  Ainb1D = PB_Cfirstnb( K, AiD, AinbD, AnbD );
324  AnpD = PB_Cnumroc( K, 0, Ainb1D, AnbD, AmyprocD, ArocD, AnprocsD );
325  Ainb1R = PB_Cfirstnb( M, AiR, AinbR, AnbR );
326 
327  Cimb = DESCC[IMB_ ]; Cinb = DESCC[INB_];
328  Cmb = DESCC[MB_ ]; Cnb = DESCC[NB_ ];
329  Csrc = DESCC[RSRC_]; Cld = DESCC[LLD_];
330  PB_Cinfog2l( IC, JC, DESCC, nprow, npcol, myrow, mycol, &Cii, &Cjj,
331  &Crow, &Ccol );
332  Cimb1 = PB_Cfirstnb( M, IC, Cimb, Cmb );
333  Cinb1 = PB_Cfirstnb( N, JC, Cinb, Cnb );
334  Cnq = PB_Cnumroc( N, 0, Cinb1, Cnb, mycol, Ccol, npcol );
335 /*
336 * Retrieve the BLACS combine topology, compute conjugate of alpha for the
337 * conjugate transpose case and set the transpose parameters to be passed to
338 * the BLAS matrix multiply routine.
339 */
340  if( notb )
341  {
342  Bm = K; Bn = N;
343  top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
344  talpha = ALPHA; GemmTa = ( nota ? CTRAN : TrA ); GemmTb = CNOTRAN;
345  }
346  else
347  {
348  Bm = N; Bn = K;
349  top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
350  if( TrB == CCOTRAN )
351  {
352  talpha = PB_Cmalloc( size ); PB_Cconjg( TYPE, ALPHA, talpha );
353  GemmTb = ( ( TrA == CCOTRAN ) ? CTRAN : CCOTRAN );
354  }
355  else
356  {
357  talpha = ALPHA;
358  GemmTb = ( ( TrA == CCOTRAN ) ? CCOTRAN : CTRAN );
359  }
360  GemmTa = CNOTRAN;
361  }
362 /*
363 * Compute descriptor Bd0 for sub( B )
364 */
365  PB_Cdescribe( Bm, Bn, IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj,
366  &Bld, &Bimb1, &Binb1, &Bmb, &Bnb, &Brow, &Bcol, Bd0 );
367 
368  Bmp = PB_Cnumroc( Bm, 0, Bimb1, Bmb, myrow, Brow, nprow );
369  Bnq = PB_Cnumroc( Bn, 0, Binb1, Bnb, mycol, Bcol, npcol );
370  if( ( Bmp > 0 ) && ( Bnq > 0 ) ) Bptr = Mptr( B, Bii, Bjj, Bld, size );
371 /*
372 * When sub( A ) is not replicated and backward pass on sub( A ), find the
373 * virtual process q owning the last row or column of sub( A ).
374 */
375  if( !( AisR = ( ( AsrcR < 0 ) || ( AnprocsR == 1 ) ) ) && !Afwd )
376  {
377  tmp = PB_Cindxg2p( M - 1, Ainb1R, AnbR, ArocR, ArocR, AnprocsR );
378  q = MModSub( tmp, ArocR, AnprocsR );
379  }
380 /*
381 * When sub( C ) is not replicated and backward pass on sub( C ), find the
382 * virtual process p owning the last row or column of sub( C ).
383 */
384  if( !( CisR = ( ( Crow < 0 ) || ( nprow == 1 ) ) ) && !Cfwd )
385  {
386  tmp = PB_Cindxg2p( M - 1, Cimb1, Cmb, Crow, Crow, nprow );
387  p = MModSub( tmp, Crow, nprow );
388  }
389 /*
390 * Loop over the virtual process grid induced by the rows or columns of
391 * sub( A ) and sub( C ).
392 */
393  lcmb = PB_Clcm( ( maxp = ( CisR ? 1 : nprow ) ) * Cmb,
394  ( maxq = ( AisR ? 1 : AnprocsR ) ) * AnbR );
395  m = M;
396  maxpm1 = maxp - 1;
397 
398  while( m > 0 )
399  {
400 /*
401 * Initialize local virtual matrix in process (p,q)
402 */
403  AcurrocR = ( AisR ? -1 : MModAdd( ArocR, q, AnprocsR ) );
404  Akk = PB_Cg2lrem( AiR, AinbR, AnbR, AcurrocR, AsrcR, AnprocsR );
405  AnpR = PB_Cnumroc( M, 0, Ainb1R, AnbR, AcurrocR, ArocR, AnprocsR );
406 
407  Ccurrow = ( CisR ? -1 : MModAdd( Crow, p, nprow ) );
408  Ckk = PB_Cg2lrem( IC, Cimb, Cmb, Ccurrow, Csrc, nprow );
409  Cmp = PB_Cnumroc( M, 0, Cimb1, Cmb, Ccurrow, Crow, nprow );
410 
411  PB_CVMinit( &VM, 0, Cmp, AnpR, Cimb1, Ainb1R, Cmb, AnbR, p, q,
412  maxp, maxq, lcmb );
413 /*
414 * Find how many diagonals in this virtual process
415 */
416  npq = PB_CVMnpq( &VM );
417 
418  m -= npq;
419 /*
420 * Re-adjust the number of rows or columns to be (un)packed, in order to
421 * average the message sizes.
422 */
423  if( npq ) mbb = npq / ( ( npq - 1 ) / mb + 1 );
424 
425  while( npq )
426  {
427  mbb = MIN( mbb, npq );
428 /*
429 * Find out how many rows or columns of sub( A ) and sub( C ) are contiguous
430 */
431  PB_CVMcontig( &VM, &nrpq, &ncpq, &Coff, &Aoff );
432 
433  if( nota )
434  {
435 /*
436 * Compute the descriptor DBUFA for the buffer that will contained the packed
437 * columns of sub( A ).
438 */
439  if( ( Afr = ( ncpq < mbb ) ) != 0 )
440  {
441 /*
442 * If rows of sub( A ) are not contiguous, then allocate the buffer and
443 * pack the mbb rows of sub( A ).
444 */
445  Abufld = mbb;
446  if( AisR || ( AmyprocR == AcurrocR ) )
447  {
448  Abuf = PB_Cmalloc( AnpD * mbb * size );
449  PB_CVMpack( TYPE, &VM, COLUMN, &Aroc, PACKING, NOTRAN, mbb,
450  AnpD, one, Mptr( A, Akk, AiiD, Ald, size ), Ald,
451  zero, Abuf, Abufld );
452  }
453  }
454  else
455  {
456 /*
457 * Otherwise, re-use sub( B ) directly.
458 */
459  Abufld = Ald;
460  if( AisR || ( AmyprocR == AcurrocR ) )
461  Abuf = Mptr( A, Akk+Aoff, AiiD, Ald, size );
462  }
463  PB_Cdescset( DBUFA, mbb, K, mbb, Ainb1D, mbb, AnbD, AcurrocR,
464  ArocD, ctxt, Abufld );
465  }
466  else
467  {
468 /*
469 * Compute the descriptor DBUFA for the buffer that will contained the packed
470 * columns of sub( A ).
471 */
472  if( ( Afr = ( ncpq < mbb ) ) != 0 )
473  {
474 /*
475 * If columns of sub( A ) are not contiguous, then allocate the buffer and pack
476 * the mbb columns of sub( A ).
477 */
478  Abufld = MAX( 1, AnpD );
479  if( AisR || ( AmyprocR == AcurrocR ) )
480  {
481  Abuf = PB_Cmalloc( AnpD * mbb * size );
482  PB_CVMpack( TYPE, &VM, COLUMN, &Aroc, PACKING, NOTRAN, mbb,
483  AnpD, one, Mptr( A, AiiD, Akk, Ald, size ), Ald,
484  zero, Abuf, Abufld );
485  }
486  }
487  else
488  {
489 /*
490 * Otherwise, re-use sub( A ) directly.
491 */
492  Abufld = Ald;
493  if( AisR || ( AmyprocR == AcurrocR ) )
494  Abuf = Mptr( A, AiiD, Akk+Aoff, Ald, size );
495  }
496  PB_Cdescset( DBUFA, K, mbb, Ainb1D, mbb, AnbD, mbb, ArocD,
497  AcurrocR, ctxt, Abufld );
498  }
499 
500  if( notb )
501  {
502 /*
503 * Replicate this panel of rows or columns of sub( A ) over sub( B ) -> WA
504 */
505  PB_CInV( TYPE, NOCONJG, COLUMN, Bm, Bn, Bd0, mbb, Abuf, 0, 0,
506  DBUFA, &Aroc, &WA, WAd, &WAfr );
507 /*
508 * Allocate space for temporary results in scope of sub( B ) -> WC
509 */
510  PB_COutV( TYPE, ROW, INIT, Bm, Bn, Bd0, mbb, &WC, WCd, &WCfr,
511  &WCsum );
512 /*
513 * Local matrix-matrix multiply iff I own some data
514 */
515  if( Bmp > 0 && Bnq > 0 )
516  gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( &GemmTb ), &mbb, &Bnq, &Bmp,
517  talpha, WA, &WAd[LLD_], Bptr, &Bld, zero, WC, &WCd[LLD_] );
518  if( WAfr ) free( WA );
519  if( Afr && ( AisR || ( AmyprocR == AcurrocR ) ) )
520  if( Abuf ) free( Abuf );
521 /*
522 * Accumulate the intermediate results in WC
523 */
524  if( WCsum )
525  {
526  WCd[RSRC_] = Ccurrow;
527  if( Bnq > 0 )
528  gsum2d( ctxt, COLUMN, &top, mbb, Bnq, WC, WCd[LLD_],
529  WCd[RSRC_], mycol );
530  }
531 /*
532 * Compute the descriptor DBUFC for the buffer that will contained the packed
533 * rows of sub( C ). Allocate it.
534 */
535  if( ( Cfr = ( nrpq < mbb ) ) != 0 )
536  {
537 /*
538 * If rows of sub( C ) are not contiguous, then allocate the buffer
539 */
540  Cbufld = mbb; tbeta = zero;
541  if( CisR || ( myrow == Ccurrow ) )
542  Cbuf = PB_Cmalloc( Cnq * mbb * size );
543  }
544  else
545  {
546 /*
547 * Otherwise re-use sub( C )
548 */
549  Cbufld = Cld; tbeta = BETA;
550  if( CisR || ( myrow == Ccurrow ) )
551  Cbuf = Mptr( C, Ckk+Coff, Cjj, Cld, size );
552  }
553  PB_Cdescset( DBUFC, mbb, N, mbb, Cinb1, mbb, Cnb, Ccurrow, Ccol,
554  ctxt, Cbufld );
555 /*
556 * Cbuf := Cbuf + WC
557 */
558  PB_Cpaxpby( TYPE, NOCONJG, mbb, N, one, WC, 0, 0, WCd, ROW, tbeta,
559  Cbuf, 0, 0, DBUFC, ROW );
560 /*
561 * Unpack the mbb rows of sub( C ) and release the buffer containing them.
562 */
563  if( Cfr && ( CisR || ( myrow == Ccurrow ) ) )
564  {
565  PB_CVMpack( TYPE, &VM, ROW, ROW, UNPACKING, NOTRAN, mbb, Cnq,
566  BETA, Mptr( C, Ckk, Cjj, Cld, size ), Cld, one, Cbuf,
567  Cbufld );
568  if( Cbuf ) free( Cbuf );
569  }
570  if( WCfr ) free( WC );
571  }
572  else
573  {
574 /*
575 * Replicate this panel of rows or columns of sub( A ) over sub( B ) -> WA
576 */
577  PB_CInV( TYPE, NOCONJG, ROW, Bm, Bn, Bd0, mbb, Abuf, 0, 0,
578  DBUFA, &Aroc, &WA, WAd, &WAfr );
579 /*
580 * Allocate space for temporary results in scope of sub( A ) -> WC
581 */
582  PB_COutV( TYPE, COLUMN, INIT, Bm, Bn, Bd0, mbb, &WC, WCd, &WCfr,
583  &WCsum );
584 /*
585 * Local matrix-matrix multiply iff I own some data
586 */
587  if( Bmp > 0 && Bnq > 0 )
588  gemm( C2F_CHAR( &GemmTa ), C2F_CHAR( &GemmTb ), &Bmp, &mbb, &Bnq,
589  talpha, Bptr, &Bld, WA, &WAd[LLD_], zero, WC, &WCd[LLD_] );
590  if( WAfr ) free( WA );
591  if( Afr && ( AisR || ( AmyprocR == AcurrocR ) ) )
592  if( Abuf ) free( Abuf );
593 /*
594 * Accumulate the intermediate results in WC
595 */
596  if( WCsum )
597  {
598  WCd[CSRC_] = 0;
599  if( Bmp > 0 )
600  gsum2d( ctxt, ROW, &top, Bmp, mbb, WC, WCd[LLD_], myrow,
601  WCd[CSRC_] );
602  }
603 /*
604 * Compute the descriptor DBUFC for the buffer that will contained the packed
605 * rows of sub( C ). Allocate it.
606 */
607  if( ( Cfr = ( nrpq < mbb ) ) != 0 )
608  {
609 /*
610 * If rows of sub( C ) are not contiguous, then allocate the buffer
611 */
612  Cbufld = mbb; tbeta = zero;
613  if( CisR || ( myrow == Ccurrow ) )
614  Cbuf = PB_Cmalloc( Cnq * mbb * size );
615  }
616  else
617  {
618 /*
619 * Otherwise re-use sub( C )
620 */
621  Cbufld = Cld; tbeta = BETA;
622  if( CisR || ( myrow == Ccurrow ) )
623  Cbuf = Mptr( C, Ckk+Coff, Cjj, Cld, size );
624  }
625  PB_Cdescset( DBUFC, mbb, N, mbb, Cinb1, mbb, Cnb, Ccurrow, Ccol,
626  ctxt, Cbufld );
627 /*
628 * Cbuf := Cbuf + WC'
629 */
630  PB_Cpaxpby( TYPE, ( TrB == CCOTRAN ? CONJG : NOCONJG ), N, mbb,
631  one, WC, 0, 0, WCd, COLUMN, tbeta, Cbuf, 0, 0, DBUFC,
632  ROW );
633 /*
634 * Unpack the mbb rows of sub( C ) and release the buffer containing them.
635 */
636  if( Cfr && ( CisR || ( myrow == Ccurrow ) ) )
637  {
638  PB_CVMpack( TYPE, &VM, ROW, ROW, UNPACKING, NOTRAN, mbb, Cnq,
639  BETA, Mptr( C, Ckk, Cjj, Cld, size ), Cld, one, Cbuf,
640  Cbufld );
641  if( Cbuf ) free( Cbuf );
642  }
643  if( WCfr ) free( WC );
644  }
645 /*
646 * Update the local indexes of sub( B ) and sub( C )
647 */
648  PB_CVMupdate( &VM, mbb, &Ckk, &Akk );
649 
650  npq -= mbb;
651  }
652 /*
653 * Go to next or previous virtual process row or column
654 */
655  if( ( Cfwd && ( p == maxpm1 ) ) ||
656  ( !( Cfwd ) && ( p == 0 ) ) )
657  q = ( Afwd ? MModAdd1( q, maxq ) : MModSub1( q, maxq ) );
658  p = ( Cfwd ? MModAdd1( p, maxp ) : MModSub1( p, maxp ) );
659  }
660 
661  if( TrB == CCOTRAN ) free( talpha );
662 /*
663 * End of PB_CpgemmAC
664 */
665 }
TYPE
#define TYPE
Definition: clamov.c:7
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
PB_Cpaxpby
void PB_Cpaxpby()
PB_CVMcontig
void PB_CVMcontig()
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
CSRC_
#define CSRC_
Definition: PBtools.h:46
UNPACKING
#define UNPACKING
Definition: PBtools.h:54
PB_Cg2lrem
int PB_Cg2lrem()
PB_Cconjg
void PB_Cconjg()
PB_CpgemmAC
void PB_CpgemmAC(PBTYP_T *TYPE, char *DIRECA, char *DIRECC, char *TRANSA, char *TRANSB, int M, int N, int K, char *ALPHA, char *A, int IA, int JA, int *DESCA, char *B, int IB, int JB, int *DESCB, char *BETA, char *C, int IC, int JC, int *DESCC)
Definition: PB_CpgemmAC.c:26
CCOTRAN
#define CCOTRAN
Definition: PBblas.h:22
NOCONJG
#define NOCONJG
Definition: PBblas.h:45
PB_Cfirstnb
int PB_Cfirstnb()
DLEN_
#define DLEN_
Definition: PBtools.h:48
GEMM_T
F_VOID_FCT(* GEMM_T)()
Definition: pblas.h:313
NOTRAN
#define NOTRAN
Definition: PBblas.h:44
LLD_
#define LLD_
Definition: PBtools.h:47
PB_Cdescribe
void PB_Cdescribe()
PB_CVMinit
void PB_CVMinit()
PB_CVMpack
int PB_CVMpack()
MModAdd
#define MModAdd(I1, I2, d)
Definition: PBtools.h:97
CROW
#define CROW
Definition: PBblacs.h:21
IMB_
#define IMB_
Definition: PBtools.h:41
MModSub
#define MModSub(I1, I2, d)
Definition: PBtools.h:102
pilaenv_
int pilaenv_()
INIT
#define INIT
Definition: PBblas.h:61
PB_Cdescset
void PB_Cdescset()
PB_CVMupdate
void PB_CVMupdate()
MModAdd1
#define MModAdd1(I, d)
Definition: PBtools.h:100
PB_Cindxg2p
int PB_Cindxg2p()
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
GSUM2D_T
void(* GSUM2D_T)()
Definition: pblas.h:282
PB_Ctop
char * PB_Ctop()
RSRC_
#define RSRC_
Definition: PBtools.h:45
CNOTRAN
#define CNOTRAN
Definition: PBblas.h:18
COMBINE
#define COMBINE
Definition: PBblacs.h:49
CONJG
#define CONJG
Definition: PBblas.h:47
PB_Cinfog2l
void PB_Cinfog2l()
PB_Cnumroc
int PB_Cnumroc()
PB_Cmalloc
char * PB_Cmalloc()
CFORWARD
#define CFORWARD
Definition: PBblas.h:38
PB_CInV
void PB_CInV()
PB_VM_T
Definition: pblas.h:432
PB_CVMnpq
int PB_CVMnpq()
MIN
#define MIN(a_, b_)
Definition: PBtools.h:76
CCOLUMN
#define CCOLUMN
Definition: PBblacs.h:20
PACKING
#define PACKING
Definition: PBtools.h:53
INB_
#define INB_
Definition: PBtools.h:42
PB_COutV
void PB_COutV()
C2F_CHAR
#define C2F_CHAR(a)
Definition: pblas.h:121
MModSub1
#define MModSub1(I, d)
Definition: PBtools.h:105
MAX
#define MAX(a_, b_)
Definition: PBtools.h:77
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
Mupcase
#define Mupcase(C)
Definition: PBtools.h:83
CTRAN
#define CTRAN
Definition: PBblas.h:20
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
CTXT_
#define CTXT_
Definition: PBtools.h:38
PB_Clcm
int PB_Clcm()