SCALAPACK 2.2.2
LAPACK: Linear Algebra PACKage
Loading...
Searching...
No Matches
PB_CptrmmAB.c
Go to the documentation of this file.
1/* ---------------------------------------------------------------------
2*
3* -- PBLAS auxiliary routine (version 2.0) --
4* University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5* and University of California, Berkeley.
6* April 1, 1998
7*
8* ---------------------------------------------------------------------
9*/
10/*
11* Include files
12*/
13#include "../pblas.h"
14#include "../PBpblas.h"
15#include "../PBtools.h"
16#include "../PBblacs.h"
17#include "../PBblas.h"
18
19#ifdef __STDC__
20void PB_CptrmmAB( PBTYP_T * TYPE, char * VARIANT, char * SIDE, char * UPLO,
21 char * TRANSA, char * DIAG, Int M, Int N, char * ALPHA,
22 char * A, Int IA, Int JA, Int * DESCA, char * B, Int IB,
23 Int JB, Int * DESCB )
24#else
25void PB_CptrmmAB( TYPE, VARIANT, SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, IA,
26 JA, DESCA, B, IB, JB, DESCB )
27/*
28* .. Scalar Arguments ..
29*/
30 char * DIAG, * SIDE, * TRANSA, * UPLO, * VARIANT;
31 Int IA, IB, JA, JB, M, N;
32 char * ALPHA;
33 PBTYP_T * TYPE;
34/*
35* .. Array Arguments ..
36*/
37 Int * DESCA, * DESCB;
38 char * A, * B;
39#endif
40{
41/*
42* Purpose
43* =======
44*
45* PB_CptrmmAB performs one of the matrix-matrix operations
46*
47* sub( B ) := alpha * op( sub( A ) ) * sub( B ),
48*
49* or
50*
51* sub( B ) := alpha * sub( B ) * op( sub( A ) ),
52*
53* where
54*
55* sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L',
56* A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and,
57*
58* sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1).
59*
60* Alpha is a scalar, sub( B ) is an m by n submatrix, sub( A ) is a
61* unit, or non-unit, upper or lower triangular submatrix and op( X ) is
62* one of
63*
64* op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ).
65*
66* This is the outer-product algorithm using the logical aggregation
67* blocking technique.
68*
69* Notes
70* =====
71*
72* A description vector is associated with each 2D block-cyclicly dis-
73* tributed matrix. This vector stores the information required to
74* establish the mapping between a matrix entry and its corresponding
75* process and memory location.
76*
77* In the following comments, the character _ should be read as
78* "of the distributed matrix". Let A be a generic term for any 2D
79* block cyclicly distributed matrix. Its description vector is DESC_A:
80*
81* NOTATION STORED IN EXPLANATION
82* ---------------- --------------- ------------------------------------
83* DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
84* CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
85* the NPROW x NPCOL BLACS process grid
86* A is distributed over. The context
87* itself is global, but the handle
88* (the integer value) may vary.
89* M_A (global) DESCA[ M_ ] The number of rows in the distribu-
90* ted matrix A, M_A >= 0.
91* N_A (global) DESCA[ N_ ] The number of columns in the distri-
92* buted matrix A, N_A >= 0.
93* IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
94* block of the matrix A, IMB_A > 0.
95* INB_A (global) DESCA[ INB_ ] The number of columns of the upper
96* left block of the matrix A,
97* INB_A > 0.
98* MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
99* bute the last M_A-IMB_A rows of A,
100* MB_A > 0.
101* NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
102* bute the last N_A-INB_A columns of
103* A, NB_A > 0.
104* RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
105* row of the matrix A is distributed,
106* NPROW > RSRC_A >= 0.
107* CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
108* first column of A is distributed.
109* NPCOL > CSRC_A >= 0.
110* LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
111* array storing the local blocks of
112* the distributed matrix A,
113* IF( Lc( 1, N_A ) > 0 )
114* LLD_A >= MAX( 1, Lr( 1, M_A ) )
115* ELSE
116* LLD_A >= 1.
117*
118* Let K be the number of rows of a matrix A starting at the global in-
119* dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
120* that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
121* receive if these K rows were distributed over NPROW processes. If K
122* is the number of columns of a matrix A starting at the global index
123* JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
124* lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
125* these K columns were distributed over NPCOL processes.
126*
127* The values of Lr() and Lc() may be determined via a call to the func-
128* tion PB_Cnumroc:
129* Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
130* Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
131*
132* Arguments
133* =========
134*
135* TYPE (local input) pointer to a PBTYP_T structure
136* On entry, TYPE is a pointer to a structure of type PBTYP_T,
137* that contains type information (See pblas.h).
138*
139* VARIANT (global input) pointer to CHAR
140* On entry, VARIANT specifies whether the left- or right-loo-
141* king variant of the algorithm should be used for the transpo-
142* se cases only, that is TRANSA is not 'N' or 'n'. When VARIANT
143* is 'L' or 'l', the left-looking variant is used, otherwise
144* the right-looking algorithm is selected.
145*
146* SIDE (global input) pointer to CHAR
147* On entry, SIDE specifies whether op( sub( A ) ) multiplies
148* sub( B ) from the left or right as follows:
149*
150* SIDE = 'L' or 'l' sub( B ) := alpha*op( sub( A ) )*sub( B ),
151*
152* SIDE = 'R' or 'r' sub( B ) := alpha*sub( B )*op( sub( A ) ).
153*
154* UPLO (global input) pointer to CHAR
155* On entry, UPLO specifies whether the submatrix sub( A ) is
156* an upper or lower triangular submatrix as follows:
157*
158* UPLO = 'U' or 'u' sub( A ) is an upper triangular
159* submatrix,
160*
161* UPLO = 'L' or 'l' sub( A ) is a lower triangular
162* submatrix.
163*
164* TRANSA (global input) pointer to CHAR
165* On entry, TRANSA specifies the form of op( sub( A ) ) to be
166* used in the matrix multiplication as follows:
167*
168* TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ),
169*
170* TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )',
171*
172* TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ).
173*
174* DIAG (global input) pointer to CHAR
175* On entry, DIAG specifies whether or not sub( A ) is unit
176* triangular as follows:
177*
178* DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian-
179* gular,
180*
181* DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri-
182* angular.
183*
184* M (global input) INTEGER
185* On entry, M specifies the number of rows of the submatrix
186* sub( B ). M must be at least zero.
187*
188* N (global input) INTEGER
189* On entry, N specifies the number of columns of the submatrix
190* sub( B ). N must be at least zero.
191*
192* ALPHA (global input) pointer to CHAR
193* On entry, ALPHA specifies the scalar alpha. When ALPHA is
194* supplied as zero then the local entries of the array B
195* corresponding to the entries of the submatrix sub( B ) need
196* not be set on input.
197*
198* A (local input) pointer to CHAR
199* On entry, A is an array of dimension (LLD_A, Ka), where Ka is
200* at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at
201* least Lc( 1, JA+N-1 ) otherwise. Before entry, this array
202* contains the local entries of the matrix A.
203* Before entry with UPLO = 'U' or 'u', this array contains the
204* local entries corresponding to the entries of the upper tri-
205* angular submatrix sub( A ), and the local entries correspon-
206* ding to the entries of the strictly lower triangular part of
207* the submatrix sub( A ) are not referenced.
208* Before entry with UPLO = 'L' or 'l', this array contains the
209* local entries corresponding to the entries of the lower tri-
210* angular submatrix sub( A ), and the local entries correspon-
211* ding to the entries of the strictly upper triangular part of
212* the submatrix sub( A ) are not referenced.
213* Note that when DIAG = 'U' or 'u', the local entries corres-
214* ponding to the diagonal elements of the submatrix sub( A )
215* are not referenced either, but are assumed to be unity.
216*
217* IA (global input) INTEGER
218* On entry, IA specifies A's global row index, which points to
219* the beginning of the submatrix sub( A ).
220*
221* JA (global input) INTEGER
222* On entry, JA specifies A's global column index, which points
223* to the beginning of the submatrix sub( A ).
224*
225* DESCA (global and local input) INTEGER array
226* On entry, DESCA is an integer array of dimension DLEN_. This
227* is the array descriptor for the matrix A.
228*
229* B (local input/local output) pointer to CHAR
230* On entry, B is an array of dimension (LLD_B, Kb), where Kb is
231* at least Lc( 1, JB+N-1 ). Before entry, this array contains
232* the local entries of the matrix B.
233* On exit, the local entries of this array corresponding to the
234* to the entries of the submatrix sub( B ) are overwritten by
235* the local entries of the m by n transformed submatrix.
236*
237* IB (global input) INTEGER
238* On entry, IB specifies B's global row index, which points to
239* the beginning of the submatrix sub( B ).
240*
241* JB (global input) INTEGER
242* On entry, JB specifies B's global column index, which points
243* to the beginning of the submatrix sub( B ).
244*
245* DESCB (global and local input) INTEGER array
246* On entry, DESCB is an integer array of dimension DLEN_. This
247* is the array descriptor for the matrix B.
248*
249* -- Written on April 1, 1998 by
250* Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
251*
252* ---------------------------------------------------------------------
253*/
254/*
255* .. Local Scalars ..
256*/
257 char conjg, * one, top, * zero;
258 Int Afr, Bcol, Bcurcol, Bcurimb1, Bcurinb1, Bcurrow, Bfr, Bii,
259 Bimb, Bimb1, Binb, Binb1, Bjj, Bld, Bmb, Bmp, Bmp0, Bnb, Bnq,
260 Bnq0, Brow, WAfr, WBfr, WBsum, ctxt, k, kb, kbb, ktmp, lside,
261 mycol, myrow, notran, npcol, nprow, size, unit, upper;
262/*
263* .. Local Arrays ..
264*/
265 Int Bd0[DLEN_], DBUFA[DLEN_], DBUFB[DLEN_], WAd[DLEN_],
266 WBd[DLEN_];
267 char * Aptr = NULL, * Bptr = NULL, * Bptr0 = NULL, * WA = NULL,
268 * WB = NULL;
269 GEMM_T gemm;
270 GSUM2D_T gsum2d;
271/* ..
272* .. Executable Statements ..
273*
274*/
275 lside = ( Mupcase( SIDE [0] ) == CLEFT );
276 upper = ( Mupcase( UPLO [0] ) == CUPPER );
277 notran = ( Mupcase( TRANSA[0] ) == CNOTRAN );
278 unit = ( Mupcase( DIAG [0] ) == CUNIT );
279 size = TYPE->size; one = TYPE->one; zero = TYPE->zero;
280 gsum2d = TYPE->Cgsum2d; gemm = TYPE->Fgemm;
281 kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
282/*
283* Retrieve process grid information
284*/
285 Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
286/*
287* Retrieve sub( B )'s local information: Bii, Bjj, Brow, Bcol, Bld ...
288*/
289 Bimb = DESCB[IMB_]; Binb = DESCB[INB_];
290 Bmb = DESCB[MB_ ]; Bnb = DESCB[NB_ ]; Bld = DESCB[LLD_];
291 PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj, &Brow,
292 &Bcol );
293 Bimb1 = PB_Cfirstnb( M, IB, Bimb, Bmb );
294 Bmp0 = PB_Cnumroc( M, 0, Bimb1, Bmb, myrow, Brow, nprow );
295 Binb1 = PB_Cfirstnb( N, JB, Binb, Bnb );
296 Bnq0 = PB_Cnumroc( N, 0, Binb1, Bnb, mycol, Bcol, npcol );
297 if( ( Bmp0 > 0 ) && ( Bnq0 > 0 ) ) Bptr0 = Mptr( B, Bii, Bjj, Bld, size );
298
299 if( notran )
300 {
301 if( lside )
302 {
303 if( upper )
304 {
305 for( k = 0; k < M; k += kb )
306 {
307 kbb = M - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
308/*
309* Accumulate A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 )
310*/
311 PB_CGatherV( TYPE, ALLOCATE, FORWARD, ktmp, kbb, A, IA, JA+k,
312 DESCA, COLUMN, &Aptr, DBUFA, &Afr );
313/*
314* Replicate WA := A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) over
315* B( IB:IB+k+kbb-1, JB:JB+N-1 )
316*/
317 PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
318 ctxt, Bld );
319 PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
320 DBUFA, COLUMN, &WA, WAd, &WAfr );
321/*
322* Zero lower triangle of WA( k:k+kbb-1, 0:kbb-1 )
323*/
324 if( unit )
325 PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, one,
326 WA, k, 0, WAd );
327 else if( kbb > 1 )
328 PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero,
329 WA, k+1, 0, WAd );
330/*
331* Accumulate B( IB+k:IB+k+kbb-1, JB:JB+N-1 )
332*/
333 PB_CGatherV( TYPE, ALLOCATE, FORWARD, kbb, N, B, IB+k, JB, DESCB,
334 ROW, &Bptr, DBUFB, &Bfr );
335/*
336* Replicate WB := B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) over
337* B( IB:IB+k+kbb-1, JB:JB+N-1)
338*/
339 PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Bd0, kbb, Bptr, 0, 0,
340 DBUFB, ROW, &WB, WBd, &WBfr );
341/*
342* Zero B( IB+k:IB+k+kbb-1, JB:JB+N-1 )
343*/
344 PB_Cplapad( TYPE, ALL, NOCONJG, kbb, N, zero, zero, B, IB+k, JB,
345 DESCB );
346/*
347* B( IB:IB+k+kbb-1, JB:JB+N-1 ) := ALPHA * WA * WB
348*/
349 Bmp = PB_Cnumroc( ktmp, 0, Bimb1, Bmb, myrow, Brow, nprow );
350 if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
351 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp, &Bnq0,
352 &kbb, ALPHA, WA, &WAd[LLD_], WB, &WBd[LLD_], one, Bptr0,
353 &Bld );
354
355 if( WBfr ) free( WB );
356 if( WAfr ) free( WA );
357 if( Bfr ) free( Bptr );
358 if( Afr ) free( Aptr );
359 }
360 }
361 else
362 {
363 for( k = ( ( M - 1 ) / kb ) * kb; k >= 0; k -= kb )
364 {
365 ktmp = M - k; kbb = MIN( ktmp, kb );
366/*
367* Accumulate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 )
368*/
369 PB_CGatherV( TYPE, ALLOCATE, BACKWARD, ktmp, kbb, A, IA+k, JA+k,
370 DESCA, COLUMN, &Aptr, DBUFA, &Afr );
371/*
372* Replicate WA := A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) over
373* B( IB+k:IB+M-1, JB:JB+N-1 )
374*/
375 Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb );
376 Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow );
377 PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow,
378 Bcol, ctxt, Bld );
379 PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
380 DBUFA, COLUMN, &WA, WAd, &WAfr );
381/*
382* Zero upper triangle of WA( 0:kbb-1, 0:kbb-1 )
383*/
384 if( unit )
385 PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, one,
386 WA, 0, 0, WAd );
387 else if( kbb > 1 )
388 PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero,
389 WA, 0, 1, WAd );
390/*
391* Accumulate B( IB+k:IB+k+kbb-1, JB:JB+N-1 )
392*/
393 PB_CGatherV( TYPE, ALLOCATE, BACKWARD, kbb, N, B, IB+k, JB,
394 DESCB, ROW, &Bptr, DBUFB, &Bfr );
395/*
396* Replicate WB := B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) over
397* B( IB+k:IB+M-1, JB:JB+N-1 )
398*/
399 PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Bd0, kbb, Bptr, 0, 0,
400 DBUFB, ROW, &WB, WBd, &WBfr );
401/*
402* Zero B( IB+k:IB+k+kbb-1, JB:JB+N-1 )
403*/
404 PB_Cplapad( TYPE, ALL, NOCONJG, kbb, N, zero, zero, B, IB+k, JB,
405 DESCB );
406/*
407* B( IB+k:IB+M-1, JB:JB+N-1 ) := ALPHA * WA * WB
408*/
409 Bmp = PB_Cnumroc( ktmp, k, Bimb1, Bmb, myrow, Brow, nprow );
410 if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
411 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp, &Bnq0,
412 &kbb, ALPHA, WA, &WAd[LLD_], WB, &WBd[LLD_], one,
413 Mptr( Bptr0, Bmp0-Bmp, 0, Bld, size ), &Bld );
414
415 if( WBfr ) free( WB );
416 if( WAfr ) free( WA );
417 if( Bfr ) free( Bptr );
418 if( Afr ) free( Aptr );
419 }
420 }
421 }
422 else
423 {
424 if( upper )
425 {
426 for( k = ( ( N - 1 ) / kb ) * kb; k >= 0; k -= kb )
427 {
428 ktmp = N - k; kbb = MIN( ktmp, kb );
429/*
430* Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )
431*/
432 PB_CGatherV( TYPE, ALLOCATE, BACKWARD, kbb, ktmp, A, IA+k, JA+k,
433 DESCA, ROW, &Aptr, DBUFA, &Afr );
434/*
435* Replicate WA := A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over
436* B( IB:IB+M-1, JB+k:JB+N-1 )
437*/
438 Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb );
439 Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol );
440 PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
441 Bcurcol, ctxt, Bld );
442 PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
443 DBUFA, ROW, &WA, WAd, &WAfr );
444/*
445* Zero lower triangle of WA( 0:kbb-1, 0:kbb-1 )
446*/
447 if( unit )
448 PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, one,
449 WA, 0, 0, WAd );
450 else if( kbb > 1 )
451 PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero,
452 WA, 1, 0, WAd );
453/*
454* Accumulate B( IB:IB+M-1, JB+k:JB+k+kbb-1 )
455*/
456 PB_CGatherV( TYPE, ALLOCATE, BACKWARD, M, kbb, B, IB, JB+k,
457 DESCB, COLUMN, &Bptr, DBUFB, &Bfr );
458/*
459* Replicate WB := B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) over
460* B( IB:IB+M-1, JB+k:JB+N-1 )
461*/
462 PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Bd0, kbb, Bptr, 0, 0,
463 DBUFB, COLUMN, &WB, WBd, &WBfr );
464/*
465* Zero B( IB:IB+M-1, JB+k:JB+k+kbb-1 )
466*/
467 PB_Cplapad( TYPE, ALL, NOCONJG, M, kbb, zero, zero, B, IB, JB+k,
468 DESCB );
469/*
470* B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) := ALPHA * WB * WA
471*/
472 Bnq = PB_Cnumroc( ktmp, k, Binb1, Bnb, mycol, Bcol, npcol );
473 if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
474 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Bnq,
475 &kbb, ALPHA, WB, &WBd[LLD_], WA, &WAd[LLD_], one,
476 Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ), &Bld );
477
478 if( WBfr ) free( WB );
479 if( WAfr ) free( WA );
480 if( Bfr ) free( Bptr );
481 if( Afr ) free( Aptr );
482 }
483 }
484 else
485 {
486 for( k = 0; k < N; k += kb )
487 {
488 kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
489/*
490* Accumulate A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 )
491*/
492 PB_CGatherV( TYPE, ALLOCATE, FORWARD, kbb, ktmp, A, IA+k, JA,
493 DESCA, ROW, &Aptr, DBUFA, &Afr );
494/*
495* Replicate WA := A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 ) over
496* B( IB:IB+M-1, JB:JB+k+kbb-1 )
497*/
498 PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
499 ctxt, Bld );
500 PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
501 DBUFA, ROW, &WA, WAd, &WAfr );
502/*
503* Zero upper triangle of WA( 0:kbb-1, k:k+kbb-1 )
504*/
505 if( unit )
506 PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, one,
507 WA, 0, k, WAd );
508 else if( kbb > 1 )
509 PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero,
510 WA, 0, k+1, WAd );
511/*
512* Accumulate B( IB:IB+M-1, JB+k:JB+k+kbb-1 )
513*/
514 PB_CGatherV( TYPE, ALLOCATE, FORWARD, M, kbb, B, IB, JB+k, DESCB,
515 COLUMN, &Bptr, DBUFB, &Bfr );
516/*
517* Replicate WB := B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) over
518* B( IB:IB+M-1, JB:JB+k+kbb-1 )
519*/
520 PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Bd0, kbb, Bptr, 0, 0,
521 DBUFB, COLUMN, &WB, WBd, &WBfr );
522/*
523* Zero B( IB:IB+M-1, JB+k:JB+k+kbb-1 )
524*/
525 PB_Cplapad( TYPE, ALL, NOCONJG, M, kbb, zero, zero, B, IB, JB+k,
526 DESCB );
527/*
528* B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) := ALPHA * WB * WA
529*/
530 Bnq = PB_Cnumroc( ktmp, 0, Binb1, Bnb, mycol, Bcol, npcol );
531 if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
532 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Bnq,
533 &kbb, ALPHA, WB, &WBd[LLD_], WA, &WAd[LLD_], one, Bptr0,
534 &Bld );
535
536 if( WBfr ) free( WB );
537 if( WAfr ) free( WA );
538 if( Bfr ) free( Bptr );
539 if( Afr ) free( Aptr );
540 }
541 }
542 }
543 }
544 else
545 {
546 if( Mupcase( VARIANT[0] ) == CRIGHT )
547 {
548/*
549* Right looking variant for the transpose cases
550*/
551 conjg = ( ( Mupcase( TRANSA[0] ) == CCOTRAN ) ? CCONJG : CNOCONJG );
552
553 if( lside )
554 {
555 if( !upper )
556 {
557 for( k = 0; k < M; k += kb )
558 {
559 kbb = M - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
560/*
561* Accumulate A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 )
562*/
563 PB_CGatherV( TYPE, ALLOCATE, FORWARD, kbb, ktmp, A, IA+k, JA,
564 DESCA, ROW, &Aptr, DBUFA, &Afr );
565/*
566* Replicate WA := A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 )' over
567* B( IB:IB+k+kbb-1, JB:JB+N-1 )
568*/
569 PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
570 ctxt, Bld );
571 PB_CInV( TYPE, &conjg, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
572 DBUFA, ROW, &WA, WAd, &WAfr );
573/*
574* Zero lower triangle of WA( k:k+kbb-1, 0:kbb-1 )
575*/
576 if( unit )
577 PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, one,
578 WA, k, 0, WAd );
579 else if( kbb > 1 )
580 PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero,
581 WA, k+1, 0, WAd );
582/*
583* Accumulate B( IB+k:IB+k+kbb-1, JB:JB+N-1 )
584*/
585 PB_CGatherV( TYPE, ALLOCATE, FORWARD, kbb, N, B, IB+k, JB,
586 DESCB, ROW, &Bptr, DBUFB, &Bfr );
587/*
588* Replicate WB := B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) over
589* B( IB:IB+k+kbb-1, JB:JB+N-1)
590*/
591 PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Bd0, kbb, Bptr, 0, 0,
592 DBUFB, ROW, &WB, WBd, &WBfr );
593/*
594* Zero B( IB+k:IB+k+kbb-1, JB:JB+N-1 )
595*/
596 PB_Cplapad( TYPE, ALL, NOCONJG, kbb, N, zero, zero, B, IB+k,
597 JB, DESCB );
598/*
599* B( IB:IB+k+kbb-1, JB:JB+N-1 ) := ALPHA * WA * WB
600*/
601 Bmp = PB_Cnumroc( ktmp, 0, Bimb1, Bmb, myrow, Brow, nprow );
602 if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
603 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp, &Bnq0,
604 &kbb, ALPHA, WA, &WAd[LLD_], WB, &WBd[LLD_], one,
605 Bptr0, &Bld );
606
607 if( WBfr ) free( WB );
608 if( WAfr ) free( WA );
609 if( Bfr ) free( Bptr );
610 if( Afr ) free( Aptr );
611 }
612 }
613 else
614 {
615 for( k = ( ( M - 1 ) / kb ) * kb; k >= 0; k -= kb )
616 {
617 ktmp = M - k; kbb = MIN( ktmp, kb );
618/*
619* Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+M-1 )
620*/
621 PB_CGatherV( TYPE, ALLOCATE, BACKWARD, kbb, ktmp, A, IA+k,
622 JA+k, DESCA, ROW, &Aptr, DBUFA, &Afr );
623/*
624* Replicate WA := A( IA+k:IA+k+kbb-1, JA+k:JA+M-1 )' over
625* B( IB+k:IB+M-1, JB:JB+N-1 )
626*/
627 Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb );
628 Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow );
629 PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow,
630 Bcol, ctxt, Bld );
631 PB_CInV( TYPE, &conjg, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
632 DBUFA, ROW, &WA, WAd, &WAfr );
633/*
634* Zero upper triangle of WA( 0:kbb-1, 0:kbb-1 )
635*/
636 if( unit )
637 PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, one,
638 WA, 0, 0, WAd );
639 else if( kbb > 1 )
640 PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero,
641 WA, 0, 1, WAd );
642/*
643* Accumulate B( IB+k:IB+k+kbb-1, JB:JB+N-1 )
644*/
645 PB_CGatherV( TYPE, ALLOCATE, BACKWARD, kbb, N, B, IB+k, JB,
646 DESCB, ROW, &Bptr, DBUFB, &Bfr );
647/*
648* Replicate WB := B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) over
649* B( IB+k:IB+M-1, JB:JB+N-1 )
650*/
651 PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Bd0, kbb, Bptr, 0, 0,
652 DBUFB, ROW, &WB, WBd, &WBfr );
653/*
654* Zero B( IB+k:IB+k+kbb-1, JB:JB+N-1 )
655*/
656 PB_Cplapad( TYPE, ALL, NOCONJG, kbb, N, zero, zero, B, IB+k,
657 JB, DESCB );
658/*
659* B( IB+k:IB+M-1, JB:JB+N-1 ) := ALPHA * WA * WB
660*/
661 Bmp = PB_Cnumroc( ktmp, k, Bimb1, Bmb, myrow, Brow, nprow );
662 if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
663 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp, &Bnq0,
664 &kbb, ALPHA, WA, &WAd[LLD_], WB, &WBd[LLD_], one,
665 Mptr( Bptr0, Bmp0-Bmp, 0, Bld, size ), &Bld );
666
667 if( WBfr ) free( WB );
668 if( WAfr ) free( WA );
669 if( Bfr ) free( Bptr );
670 if( Afr ) free( Aptr );
671 }
672 }
673 }
674 else
675 {
676 if( !upper )
677 {
678 for( k = ( ( N - 1 ) / kb ) * kb; k >= 0; k -= kb )
679 {
680 ktmp = N - k; kbb = MIN( ktmp, kb );
681/*
682* Accumulate A( IA+k:IA+N-1, JA+k:JA+k+kbb-1 )
683*/
684 PB_CGatherV( TYPE, ALLOCATE, BACKWARD, ktmp, kbb, A, IA+k,
685 JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr );
686/*
687* Replicate WA := A( IA+k:IA+N-1, JA+k:JA+k+kbb-1 )' over
688* B( IB:IB+M-1, JB+k:JB+N-1 )
689*/
690 Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb );
691 Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol );
692 PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
693 Bcurcol, ctxt, Bld );
694 PB_CInV( TYPE, &conjg, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
695 DBUFA, COLUMN, &WA, WAd, &WAfr );
696/*
697* Zero lower triangle of WA( 0:kbb-1, 0:kbb-1 )
698*/
699 if( unit )
700 PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, one,
701 WA, 0, 0, WAd );
702 else if( kbb > 1 )
703 PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero,
704 WA, 1, 0, WAd );
705/*
706* Accumulate B( IB:IB+M-1, JB+k:JB+k+kbb-1 )
707*/
708 PB_CGatherV( TYPE, ALLOCATE, BACKWARD, M, kbb, B, IB, JB+k,
709 DESCB, COLUMN, &Bptr, DBUFB, &Bfr );
710/*
711* Replicate WB := B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) over
712* B( IB:IB+M-1, JB+k:JB+N-1 )
713*/
714 PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Bd0, kbb, Bptr, 0, 0,
715 DBUFB, COLUMN, &WB, WBd, &WBfr );
716/*
717* Zero B( IB:IB+M-1, JB+k:JB+k+kbb-1 )
718*/
719 PB_Cplapad( TYPE, ALL, NOCONJG, M, kbb, zero, zero, B, IB,
720 JB+k, DESCB );
721/*
722* B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) := ALPHA * WB * WA
723*/
724 Bnq = PB_Cnumroc( ktmp, k, Binb1, Bnb, mycol, Bcol, npcol );
725 if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
726 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Bnq,
727 &kbb, ALPHA, WB, &WBd[LLD_], WA, &WAd[LLD_], one,
728 Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ), &Bld );
729
730 if( WBfr ) free( WB );
731 if( WAfr ) free( WA );
732 if( Bfr ) free( Bptr );
733 if( Afr ) free( Aptr );
734 }
735 }
736 else
737 {
738 for( k = 0; k < N; k += kb )
739 {
740 kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
741/*
742* Accumulate A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 )
743*/
744 PB_CGatherV( TYPE, ALLOCATE, FORWARD, ktmp, kbb, A, IA, JA+k,
745 DESCA, COLUMN, &Aptr, DBUFA, &Afr );
746/*
747* Replicate WA := A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 )' over
748* B( IB:IB+M-1, JB:JB+k+kbb-1 )
749*/
750 PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
751 ctxt, Bld );
752 PB_CInV( TYPE, &conjg, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
753 DBUFA, COLUMN, &WA, WAd, &WAfr );
754/*
755* Zero upper triangle of WA( 0:kbb-1, k:k+kbb-1 )
756*/
757 if( unit )
758 PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, one,
759 WA, 0, k, WAd );
760 else if( kbb > 1 )
761 PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero,
762 WA, 0, k+1, WAd );
763/*
764* Accumulate B( IB:IB+M-1, JB+k:JB+k+kbb-1 )
765*/
766 PB_CGatherV( TYPE, ALLOCATE, FORWARD, M, kbb, B, IB, JB+k,
767 DESCB, COLUMN, &Bptr, DBUFB, &Bfr );
768/*
769* Replicate WB := B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) over
770* B( IB:IB+M-1, JB:JB+k+kbb-1 )
771*/
772 PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Bd0, kbb, Bptr, 0, 0,
773 DBUFB, COLUMN, &WB, WBd, &WBfr );
774/*
775* Zero B( IB:IB+M-1, JB+k:JB+k+kbb-1 )
776*/
777 PB_Cplapad( TYPE, ALL, NOCONJG, M, kbb, zero, zero, B, IB,
778 JB+k, DESCB );
779/*
780* B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) := ALPHA * WB * WA
781*/
782 Bnq = PB_Cnumroc( ktmp, 0, Binb1, Bnb, mycol, Bcol, npcol );
783 if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
784 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Bnq,
785 &kbb, ALPHA, WB, &WBd[LLD_], WA, &WAd[LLD_], one,
786 Bptr0, &Bld );
787
788 if( WBfr ) free( WB );
789 if( WAfr ) free( WA );
790 if( Bfr ) free( Bptr );
791 if( Afr ) free( Aptr );
792 }
793 }
794 }
795 }
796 else
797 {
798/*
799* Left looking variant for the transpose cases
800*/
801 if( lside )
802 {
803 top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
804
805 if( upper )
806 {
807 for( k = ( ( M - 1 ) / kb ) * kb; k >= 0; k -= kb )
808 {
809 kbb = M - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
810/*
811* Accumulate A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 )
812*/
813 PB_CGatherV( TYPE, ALLOCATE, BACKWARD, ktmp, kbb, A, IA, JA+k,
814 DESCA, COLUMN, &Aptr, DBUFA, &Afr );
815/*
816* Replicate WA := A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 ) over
817* B( IB:IB+k+kbb-1, JB:JB+N-1 )
818*/
819 PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
820 ctxt, Bld );
821 PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
822 DBUFA, COLUMN, &WA, WAd, &WAfr );
823/*
824* Zero lower triangle of WA( k:k+kbb-1, 0:kbb-1 )
825*/
826 if( unit )
827 PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, one,
828 WA, k, 0, WAd );
829 else if( kbb > 1 )
830 PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero,
831 WA, k+1, 0, WAd );
832/*
833* WB := A( IA:IA+k+kbb-1, JA+k:JA+k+kbb-1 )' * B( IB:IB+k+kbb-1, JB:JB+N-1 )
834*/
835 PB_COutV( TYPE, ROW, INIT, ktmp, N, Bd0, kbb, &WB, WBd, &WBfr,
836 &WBsum );
837 Bmp = PB_Cnumroc( ktmp, 0, Bimb1, Bmb, myrow, Brow, nprow );
838 if( ( Bnq0 > 0 ) && ( Bmp > 0 ) )
839 gemm( C2F_CHAR( TRANSA ), C2F_CHAR( NOTRAN ), &kbb, &Bnq0,
840 &Bmp, ALPHA, WA, &WAd[LLD_], Bptr0, &Bld, zero, WB,
841 &WBd[LLD_] );
842 if( WBsum )
843 {
844 WBd[RSRC_] = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow,
845 nprow );
846 if( Bnq0 > 0 )
847 gsum2d( ctxt, COLUMN, &top, kbb, Bnq0, WB, WBd[LLD_],
848 WBd[RSRC_], mycol );
849 }
850 if( WAfr ) free( WA );
851 if( Afr ) free( Aptr );
852/*
853* B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) := WB
854*/
855 PB_CScatterV( TYPE, FORWARD, kbb, N, WB, 0, 0, WBd, ROW, zero,
856 B, IB+k, JB, DESCB, ROW );
857 if( WBfr ) free( WB );
858 }
859 }
860 else
861 {
862 for( k = 0; k < M; k += kb )
863 {
864 ktmp = M - k; kbb = MIN( ktmp, kb );
865/*
866* Accumulate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 )
867*/
868 PB_CGatherV( TYPE, ALLOCATE, FORWARD, ktmp, kbb, A, IA+k,
869 JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr );
870/*
871* Replicate WA := A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) over
872* B( IB+k:IB+M-1, JB:JB+N-1 )
873*/
874 Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb );
875 Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow );
876 PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow,
877 Bcol, ctxt, Bld );
878 PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
879 DBUFA, COLUMN, &WA, WAd, &WAfr );
880/*
881* Zero upper triangle of WA( 0:kbb-1, 0:kbb-1 )
882*/
883 if( unit )
884 PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, one,
885 WA, 0, 0, WAd );
886 else if( kbb > 1 )
887 PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero,
888 WA, 0, 1, WAd );
889/*
890* WB := A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 )' * B( IB+k:IB+M-1, JB:JB+N-1 )
891*/
892 PB_COutV( TYPE, ROW, INIT, ktmp, N, Bd0, kbb, &WB, WBd, &WBfr,
893 &WBsum );
894 Bmp = PB_Cnumroc( ktmp, k, Bimb1, Bmb, myrow, Brow, nprow );
895 if( ( Bnq0 > 0 ) && ( Bmp > 0 ) )
896 gemm( C2F_CHAR( TRANSA ), C2F_CHAR( NOTRAN ), &kbb, &Bnq0,
897 &Bmp, ALPHA, WA, &WAd[LLD_], Mptr( Bptr0, Bmp0-Bmp,
898 0, Bld, size ), &Bld, zero, WB, &WBd[LLD_] );
899 if( WBsum )
900 {
901 WBd[RSRC_] = PB_Cindxg2p( k + kbb - 1, Bimb1, Bmb, Brow,
902 Brow, nprow );
903 if( Bnq0 > 0 )
904 gsum2d( ctxt, COLUMN, &top, kbb, Bnq0, WB, WBd[LLD_],
905 WBd[RSRC_], mycol );
906
907 }
908 if( WAfr ) free( WA );
909 if( Afr ) free( Aptr );
910/*
911* B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) := WB
912*/
913 PB_CScatterV( TYPE, BACKWARD, kbb, N, WB, 0, 0, WBd, ROW,
914 zero, B, IB+k, JB, DESCB, ROW );
915 if( WBfr ) free( WB );
916 }
917 }
918 }
919 else
920 {
921 top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
922
923 if( upper )
924 {
925 for( k = 0; k < N; k += kb )
926 {
927 ktmp = N - k; kbb = MIN( ktmp, kb );
928/*
929* Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )
930*/
931 PB_CGatherV( TYPE, ALLOCATE, FORWARD, kbb, ktmp, A, IA+k,
932 JA+k, DESCA, ROW, &Aptr, DBUFA, &Afr );
933/*
934* Replicate WA := A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over
935* B( IB:IB+M-1, JB+k:JB+N-1 )
936*/
937 Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb );
938 Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol );
939 PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
940 Bcurcol, ctxt, Bld );
941 PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
942 DBUFA, ROW, &WA, WAd, &WAfr );
943/*
944* Zero lower triangle of WA( 0:kbb-1, 0:kbb-1 )
945*/
946 if( unit )
947 PB_Cplapad( TYPE, LOWER, NOCONJG, kbb, kbb, zero, one,
948 WA, 0, 0, WAd );
949 else if( kbb > 1 )
950 PB_Cplapad( TYPE, LOWER, NOCONJG, kbb-1, kbb-1, zero, zero,
951 WA, 1, 0, WAd );
952/*
953* WB := B( IB:IB+M-1, JB+k:JB+N-1 ) * A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )'
954*/
955 PB_COutV( TYPE, COLUMN, INIT, M, ktmp, Bd0, kbb, &WB, WBd,
956 &WBfr, &WBsum );
957 Bnq = PB_Cnumroc( ktmp, k, Binb1, Bnb, mycol, Bcol, npcol );
958 if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
959 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRANSA ), &Bmp0, &kbb,
960 &Bnq, ALPHA, Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ),
961 &Bld, WA, &WAd[LLD_], zero, WB, &WBd[LLD_] );
962 if( WBsum )
963 {
964 WBd[CSRC_] = PB_Cindxg2p( k + kbb - 1, Binb1, Bnb, Bcol,
965 Bcol, npcol );
966 if( Bmp0 > 0 )
967 gsum2d( ctxt, ROW, &top, Bmp0, kbb, WB, WBd[LLD_],
968 myrow, WBd[CSRC_] );
969 }
970 if( WAfr ) free( WA );
971 if( Afr ) free( Aptr );
972/*
973* B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) := WB
974*/
975 PB_CScatterV( TYPE, BACKWARD, M, kbb, WB, 0, 0, WBd, COLUMN,
976 zero, B, IB, JB+k, DESCB, COLUMN );
977 if( WBfr ) free( WB );
978 }
979 }
980 else
981 {
982 for( k = ( ( N - 1 ) / kb ) * kb; k >= 0; k -= kb )
983 {
984 kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
985/*
986* Accumulate A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 )
987*/
988 PB_CGatherV( TYPE, ALLOCATE, BACKWARD, kbb, ktmp, A, IA+k, JA,
989 DESCA, ROW, &Aptr, DBUFA, &Afr );
990/*
991* Replicate WA := A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 ) over
992* B( IB:IB+M-1, JB:JB+k+kbb-1 )
993*/
994 PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
995 ctxt, Bld );
996 PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
997 DBUFA, ROW, &WA, WAd, &WAfr );
998/*
999* Zero upper triangle of WA( 0:kbb-1, k:k+kbb-1 )
1000*/
1001 if( unit )
1002 PB_Cplapad( TYPE, UPPER, NOCONJG, kbb, kbb, zero, one,
1003 WA, 0, k, WAd );
1004 else if( kbb > 1 )
1005 PB_Cplapad( TYPE, UPPER, NOCONJG, kbb-1, kbb-1, zero, zero,
1006 WA, 0, k+1, WAd );
1007/*
1008* WB := B( IB:IB+M-1, JB:JB+k+kbb-1 ) * A( IA+k:IA+k+kbb-1, JA:JA+k+kbb-1 )'
1009*/
1010 PB_COutV( TYPE, COLUMN, INIT, M, ktmp, Bd0, kbb, &WB, WBd,
1011 &WBfr, &WBsum );
1012 Bnq = PB_Cnumroc( ktmp, 0, Binb1, Bnb, mycol, Bcol, npcol );
1013 if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
1014 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRANSA ), &Bmp0, &kbb,
1015 &Bnq, ALPHA, Bptr0, &Bld, WA, &WAd[LLD_], zero, WB,
1016 &WBd[LLD_] );
1017 if( WBsum )
1018 {
1019 WBd[CSRC_] = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol,
1020 npcol );
1021 if( Bmp0 > 0 )
1022 gsum2d( ctxt, ROW, &top, Bmp0, kbb, WB, WBd[LLD_],
1023 myrow, WBd[CSRC_] );
1024 }
1025 if( WAfr ) free( WA );
1026 if( Afr ) free( Aptr );
1027/*
1028* B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) := WB
1029*/
1030 PB_CScatterV( TYPE, FORWARD, M, kbb, WB, 0, 0, WBd, COLUMN,
1031 zero, B, IB, JB+k, DESCB, COLUMN );
1032 if( WBfr ) free( WB );
1033 }
1034 }
1035 }
1036 }
1037 }
1038/*
1039* End of PB_CptrmmAB
1040*/
1041}
#define Int
Definition Bconfig.h:22
F_VOID_FCT(* GEMM_T)()
Definition pblas.h:317
void(* GSUM2D_T)()
Definition pblas.h:286
#define C2F_CHAR(a)
Definition pblas.h:125
#define TOP_GET
Definition PBblacs.h:50
#define COLUMN
Definition PBblacs.h:45
#define COMBINE
Definition PBblacs.h:49
#define ROW
Definition PBblacs.h:46
void Cblacs_gridinfo()
#define NOTRAN
Definition PBblas.h:44
#define CCONJG
Definition PBblas.h:21
#define ALL
Definition PBblas.h:50
#define CLEFT
Definition PBblas.h:29
#define CNOCONJG
Definition PBblas.h:19
#define NOCONJG
Definition PBblas.h:45
#define CUPPER
Definition PBblas.h:26
#define CNOTRAN
Definition PBblas.h:18
#define FORWARD
Definition PBblas.h:64
#define CRIGHT
Definition PBblas.h:30
#define LOWER
Definition PBblas.h:51
#define ALLOCATE
Definition PBblas.h:68
#define CCOTRAN
Definition PBblas.h:22
#define INIT
Definition PBblas.h:61
#define UPPER
Definition PBblas.h:52
#define BACKWARD
Definition PBblas.h:65
#define CUNIT
Definition PBblas.h:32
#define pilaenv_
Definition PBpblas.h:44
#define CTXT_
Definition PBtools.h:38
Int PB_Cfirstnb()
#define MB_
Definition PBtools.h:43
void PB_CptrmmAB()
void PB_Cinfog2l()
#define MIN(a_, b_)
Definition PBtools.h:76
#define Mptr(a_, i_, j_, lda_, siz_)
Definition PBtools.h:132
#define LLD_
Definition PBtools.h:47
void PB_CGatherV()
Int PB_Cnumroc()
char * PB_Ctop()
void PB_CInV()
void PB_CScatterV()
void PB_Cplapad()
#define RSRC_
Definition PBtools.h:45
void PB_Cdescset()
void PB_COutV()
#define INB_
Definition PBtools.h:42
#define CSRC_
Definition PBtools.h:46
#define IMB_
Definition PBtools.h:41
Int PB_Cindxg2p()
#define Mupcase(C)
Definition PBtools.h:83
#define DLEN_
Definition PBtools.h:48
#define NB_
Definition PBtools.h:44
#define TYPE
Definition clamov.c:7
Int size
Definition pblas.h:333