ScaLAPACK 2.1  2.1
ScaLAPACK: Scalable Linear Algebra PACKage
PB_CInV2.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS auxiliary routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "../pblas.h"
14 #include "../PBpblas.h"
15 #include "../PBtools.h"
16 #include "../PBblacs.h"
17 #include "../PBblas.h"
18 
19 #ifdef __STDC__
20 void PB_CInV2( PBTYP_T * TYPE, char * CONJUG, char * ROWCOL, int M,
21  int N, int * DESCA, int K, char * X, int IX, int JX,
22  int * DESCX, char * XROC, char * XAPTR, int IJXA,
23  int * DXA )
24 #else
25 void PB_CInV2( TYPE, CONJUG, ROWCOL, M, N, DESCA, K, X, IX, JX, DESCX,
26  XROC, XAPTR, IJXA, DXA )
27 /*
28 * .. Scalar Arguments ..
29 */
30  char * CONJUG, * ROWCOL, * XROC;
31  int IJXA, IX, JX, K, M, N;
32  PBTYP_T * TYPE;
33 /*
34 * .. Array Arguments ..
35 */
36  int * DESCA, * DESCX, * DXA;
37  char * X, * XAPTR;
38 #endif
39 {
40 /*
41 * Purpose
42 * =======
43 *
44 * PB_CInV2 adds data to an array that contains a one-dimensional input
45 * only subvector which is replicated over the rows or columns of a sub-
46 * matrix described by DESCA. A subvector is specified on input to this
47 * routine that is added to the replicated buffer. This routine is spe-
48 * cifically designed for LCM hybrid variants.
49 *
50 * Notes
51 * =====
52 *
53 * A description vector is associated with each 2D block-cyclicly dis-
54 * tributed matrix. This vector stores the information required to
55 * establish the mapping between a matrix entry and its corresponding
56 * process and memory location.
57 *
58 * In the following comments, the character _ should be read as
59 * "of the distributed matrix". Let A be a generic term for any 2D
60 * block cyclicly distributed matrix. Its description vector is DESC_A:
61 *
62 * NOTATION STORED IN EXPLANATION
63 * ---------------- --------------- ------------------------------------
64 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
65 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
66 * the NPROW x NPCOL BLACS process grid
67 * A is distributed over. The context
68 * itself is global, but the handle
69 * (the integer value) may vary.
70 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
71 * ted matrix A, M_A >= 0.
72 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
73 * buted matrix A, N_A >= 0.
74 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
75 * block of the matrix A, IMB_A > 0.
76 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
77 * left block of the matrix A,
78 * INB_A > 0.
79 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
80 * bute the last M_A-IMB_A rows of A,
81 * MB_A > 0.
82 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
83 * bute the last N_A-INB_A columns of
84 * A, NB_A > 0.
85 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
86 * row of the matrix A is distributed,
87 * NPROW > RSRC_A >= 0.
88 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
89 * first column of A is distributed.
90 * NPCOL > CSRC_A >= 0.
91 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
92 * array storing the local blocks of
93 * the distributed matrix A,
94 * IF( Lc( 1, N_A ) > 0 )
95 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
96 * ELSE
97 * LLD_A >= 1.
98 *
99 * Let K be the number of rows of a matrix A starting at the global in-
100 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
101 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
102 * receive if these K rows were distributed over NPROW processes. If K
103 * is the number of columns of a matrix A starting at the global index
104 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
105 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
106 * these K columns were distributed over NPCOL processes.
107 *
108 * The values of Lr() and Lc() may be determined via a call to the func-
109 * tion PB_Cnumroc:
110 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
111 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
112 *
113 * Arguments
114 * =========
115 *
116 * TYPE (local input) pointer to a PBTYP_T structure
117 * On entry, TYPE is a pointer to a structure of type PBTYP_T,
118 * that contains type information (See pblas.h).
119 *
120 * CONJUG (global input) pointer to CHAR
121 * On entry, CONJUG specifies if this routine should conjugate
122 * the subvector as follows:
123 * = 'N' or 'n': The initial subvector is copied,
124 * = 'Z' or 'z': The conjugate subvector is copied.
125 *
126 * ROWCOL (global input) pointer to CHAR
127 * On entry, ROWCOL specifies if the existing buffer pointed to
128 * XAPTR is a row or column subvector replicated over the under-
129 * lying submatrix as follows:
130 * = 'R' or 'r': XAPTR is a row subvector,
131 * = 'C' or 'c': XAPTR is a column subvector.
132 *
133 * M (global input) INTEGER
134 * On entry, M specifies the number of rows of the underlying
135 * submatrix described by DESCA. M must be at least zero.
136 *
137 * N (global input) INTEGER
138 * On entry, N specifies the number of columns of the underlying
139 * submatrix described by DESCA. N must be at least zero.
140 *
141 * DESCA (global and local input) INTEGER array
142 * On entry, DESCA is an integer array of dimension DLEN_. This
143 * is the array descriptor for the matrix A.
144 *
145 * K (global input) INTEGER
146 * On entry, K specifies the length of the non-distributed di-
147 * mension of the subvector sub( X ). K must be at least zero.
148 *
149 * X (local input) pointer to CHAR
150 * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X
151 * is at least MAX( 1, Lr( K, IX ) ) when XROC is 'R' or 'r'
152 * and MAX( 1, Lr( 1, IX+Lx-1 ) ) otherwise, and, Kx is at least
153 * Lc( 1, JX+Lx-1 ) when INCX = M_X and Lc( K, JX ) otherwise.
154 * Lx is N when ROWCOL = 'R' or 'r' and M otherwise. Before en-
155 * try, this array contains the local entries of the matrix X.
156 *
157 * IX (global input) INTEGER
158 * On entry, IX specifies X's global row index, which points to
159 * the beginning of the submatrix sub( X ).
160 *
161 * JX (global input) INTEGER
162 * On entry, JX specifies X's global column index, which points
163 * to the beginning of the submatrix sub( X ).
164 *
165 * DESCX (global and local input) INTEGER array
166 * On entry, DESCX is an integer array of dimension DLEN_. This
167 * is the array descriptor for the matrix X.
168 *
169 * XROC (global input) pointer to CHAR
170 * On entry, XROC specifies the orientation of the subvector
171 * sub( X ). When XROC is 'R' or 'r', sub( X ) is a row vector,
172 * and a column vector otherwise.
173 *
174 * XAPTR (local input/local output) pointer to CHAR
175 * On entry, XAPTR is an array containing some initial data. On
176 * exit, the subvector sub( X ) is copied into this array which
177 * is replicated over the rows or columns of the underlying ma-
178 * trix as specified by ROWCOL and DESCA.
179 *
180 * IJXA (global input) INTEGER
181 * On entry, IJXA specifies XA global row or column index depen-
182 * ding on ROWCOL in the array pointed to by XAPTR, where the
183 * subvector sub( X ) should copied.
184 *
185 * DXA (global and local input) INTEGER array
186 * On entry, DXA is a descriptor array of dimension DLEN_ des-
187 * cribing the data layout of the data pointed to by XAPTR.
188 *
189 * -- Written on April 1, 1998 by
190 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
191 *
192 * ---------------------------------------------------------------------
193 */
194 /*
195 * .. Local Scalars ..
196 */
197  char * Xptr = NULL, * top;
198  int AColSpan, ARowSpan, Acol, Aimb, Ainb, AisD, Amb, Amp, Anb,
199  Anq, Arow, XAld, Xcol, Xii, Ximb1, Xinb1, XisD, XisR, XisRow,
200  Xjj, Xld=1, Xmb, Xnb, Xrow, ctxt, mycol, myrow, npcol, nprow,
201  size;
202 /* ..
203 * .. Executable Statements ..
204 *
205 */
206 /*
207 * Quick return if possible
208 */
209  if( ( M <= 0 ) || ( N <= 0 ) || ( K <= 0 ) ) return;
210 /*
211 * Retrieve process grid information
212 */
213  Cblacs_gridinfo( ( ctxt = DESCX[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
214 /*
215 * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Ycol
216 */
217  Minfog2l( IX, JX, DESCX, nprow, npcol, myrow, mycol, Xii, Xjj, Xrow, Xcol );
218 /*
219 * Is sub( X ) distributed or not, replicated or not ?
220 */
221  if( ( XisRow = ( Mupcase( XROC[0] ) == CROW ) ) != 0 )
222  {
223  XisD = ( ( Xcol >= 0 ) && ( npcol > 1 ) );
224  XisR = ( ( Xrow == -1 ) || ( nprow == 1 ) );
225  }
226  else
227  {
228  XisD = ( ( Xrow >= 0 ) && ( nprow > 1 ) );
229  XisR = ( ( Xcol == -1 ) || ( npcol == 1 ) );
230  }
231 
232  Arow = DESCA[ RSRC_ ]; Acol = DESCA[ CSRC_ ];
233 
234  if( Mupcase( ROWCOL[0] ) == CROW )
235  {
236 /*
237 * Want a row vector. It is possible to reuse sub( X ) iff sub( X ) is already
238 * a row vector and the data does not need to be conjugated.
239 */
240  if( XisRow && ( Mupcase( CONJUG[0] ) == CNOCONJG ) )
241  {
242  AisD = ( ( Acol >= 0 ) && ( npcol > 1 ) );
243  Ainb = DESCA[INB_]; Anb = DESCA[NB_]; Xnb = DESCX[NB_];
244  Mfirstnb( Xinb1, N, JX, DESCX[INB_], Xnb );
245 /*
246 * sub( X ) is aligned with A (reuse condition) iff both operands are not
247 * distributed, or both of them are distributed and start in the same process
248 * column and either N is smaller than the first blocksize of sub( X ) and A,
249 * or their column blocking factors match.
250 */
251  if( ( !AisD && !XisD ) ||
252  ( ( AisD && XisD ) &&
253  ( ( Acol == Xcol ) &&
254  ( ( ( Ainb >= N ) && ( Xinb1 >= N ) ) ||
255  ( ( Ainb == Xinb1 ) && ( Anb == Xnb ) ) ) ) ) )
256  {
257 /*
258 * sub( X ) is aligned with A. Does A spans multiples process rows ? It does
259 * if Arow < 0.
260 */
261  ARowSpan = ( Arow < 0 ) ||
262  Mspan( M, 0, DESCA[IMB_], DESCA[MB_], Arow, nprow );
263 
264  Mnumroc( Anq, N, 0, Ainb, Anb, mycol, Acol, npcol );
265 
266  if( XisR || ( !ARowSpan && ( Arow == Xrow ) ) )
267  {
268 /*
269 * If sub( X ) is replicated, or, A spans only one process row and either
270 * sub( X ) is replicated or resides in the same process row than A, then
271 * sub( X ) is already at the correct place.
272 */
273  if( ( Anq > 0 ) && ( ARowSpan || ( myrow == Arow ) ) )
274  {
275  size = TYPE->size; Xld = DESCX[ LLD_ ]; XAld = DXA[LLD_];
276  TYPE->Fmmadd( &K, &Anq, TYPE->one, Mptr( X, Xii, Xjj, Xld,
277  size ), &Xld, TYPE->zero, Mptr( XAPTR, IJXA,
278  0, XAld, size ), &XAld );
279  }
280  }
281  else if( ARowSpan )
282  {
283 /*
284 * Otherwise, we know that sub( X ) cannot be replicated, let suppose in
285 * addition that A spans all process rows. sub( X ) need simply to be broadcast
286 * over A.
287 */
288  if( myrow == Xrow )
289  {
290  if( Anq > 0 )
291  {
292  top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
293  size = TYPE->size; Xld = DESCX[LLD_]; XAld = DXA[LLD_];
294  Xptr = Mptr( XAPTR, IJXA, 0, XAld, size );
295  TYPE->Fmmadd( &K, &Anq, TYPE->one, Mptr( X, Xii, Xjj, Xld,
296  size ), &Xld, TYPE->zero, Xptr, &XAld );
297  TYPE->Cgebs2d( ctxt, COLUMN, top, K, Anq, Xptr, XAld );
298  }
299  }
300  else
301  {
302  if( Anq > 0 )
303  {
304  top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
305  XAld = DXA[LLD_];
306  TYPE->Cgebr2d( ctxt, COLUMN, top, K, Anq, Mptr( XAPTR,
307  IJXA, 0, XAld, TYPE->size ), XAld, Xrow,
308  mycol );
309  }
310  }
311  }
312  else
313  {
314 /*
315 * Finally, sub( X ) is not replicated and A spans only one process row. There
316 * is no need to broadcast, a send/recv is sufficient.
317 */
318  if( myrow == Xrow )
319  {
320  if( Anq > 0 )
321  {
322  Xld = DESCX[LLD_];
323  TYPE->Cgesd2d( ctxt, K, Anq, Mptr( X, Xii, Xjj, Xld,
324  TYPE->size ), Xld, Arow, mycol );
325  }
326  }
327  else if( myrow == Arow )
328  {
329  if( Anq > 0 )
330  {
331  XAld = DXA[LLD_];
332  TYPE->Cgerv2d( ctxt, K, Anq, Mptr( XAPTR, IJXA, 0, XAld,
333  TYPE->size ), XAld, Xrow, mycol );
334  }
335  }
336  }
337  return;
338  }
339  }
340 /*
341 * sub( X ) cannot be reused, too bad ... redistribute
342 */
343  if( XisRow )
344  {
345  PB_Cpaxpby( TYPE, CONJUG, K, N, TYPE->one, X, IX, JX, DESCX, XROC,
346  TYPE->zero, XAPTR, IJXA, 0, DXA, ROW );
347  }
348  else
349  {
350  PB_Cpaxpby( TYPE, CONJUG, N, K, TYPE->one, X, IX, JX, DESCX, XROC,
351  TYPE->zero, XAPTR, IJXA, 0, DXA, ROW );
352  }
353  }
354  else
355  {
356 /*
357 * Want a column vector. It is possible to reuse sub( X ) iff sub( X ) is
358 * already a column vector and the data does not need to be conjugated
359 */
360  if( !( XisRow ) && ( Mupcase( CONJUG[0] ) == CNOCONJG ) )
361  {
362  AisD = ( ( Arow >= 0 ) && ( nprow > 1 ) );
363  Aimb = DESCA[IMB_]; Amb = DESCA[MB_]; Xmb = DESCX[MB_];
364  Mfirstnb( Ximb1, M, IX, DESCX[IMB_], Xmb );
365 /*
366 * sub( X ) is aligned with A (reuse condition) iff both operands are not
367 * distributed, or both of them are distributed and start in the same process
368 * row and either M is smaller than the first blocksize of sub( X ) and A, or
369 * their row blocking factors match.
370 */
371  if( ( !AisD && !XisD ) ||
372  ( ( AisD && XisD ) &&
373  ( ( Arow == Xrow ) &&
374  ( ( ( Aimb >= M ) && ( Ximb1 >= M ) ) ||
375  ( ( Aimb == Ximb1 ) && ( Amb == Xmb ) ) ) ) ) )
376  {
377 /*
378 * sub( X ) is aligned with A. Does A spans multiples process columns ? It
379 * does if Acol < 0.
380 */
381  AColSpan = ( Acol < 0 ) ||
382  Mspan( N, 0, DESCA[INB_], DESCA[NB_], Acol, npcol );
383 
384  Mnumroc( Amp, M, 0, Aimb, Amb, myrow, Arow, nprow );
385 
386  if( XisR || ( !AColSpan && ( Acol == Xcol ) ) )
387  {
388 /*
389 * If sub( X ) is replicated, or, A spans only one process column and either
390 * sub( X ) is replicated or resides in the same process columns than A, then
391 * sub( X ) is already at the correct place.
392 */
393  if( ( Amp > 0 ) && ( AColSpan || ( mycol == Acol ) ) )
394  {
395  size = TYPE->size; Xld = DESCX[ LLD_ ]; XAld = DXA[LLD_];
396  TYPE->Fmmadd( &Amp, &K, TYPE->one, Mptr( X, Xii, Xjj, Xld,
397  size ), &Xld, TYPE->zero, Mptr( XAPTR, 0, IJXA,
398  XAld, size ), &XAld );
399  }
400  }
401  else if( AColSpan )
402  {
403 /*
404 * Otherwise, we know that sub( X ) is not be replicated, let suppose in
405 * addition that A spans all process columns. sub( X ) need simply to be
406 * broadcast over A.
407 */
408  if( mycol == Xcol )
409  {
410  if( Amp > 0 )
411  {
412  top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
413  size = TYPE->size; Xld = DESCX[LLD_]; XAld = DXA[LLD_];
414  Xptr = Mptr( XAPTR, 0, IJXA, XAld, size );
415  TYPE->Fmmadd( &Amp, &K, TYPE->one, Mptr( X, Xii, Xjj, Xld,
416  size ), &Xld, TYPE->zero, Xptr, &XAld );
417  TYPE->Cgebs2d( ctxt, ROW, top, Amp, K, Xptr, XAld );
418  }
419  }
420  else
421  {
422  if( Amp > 0 )
423  {
424  top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
425  XAld = DXA[LLD_];
426  TYPE->Cgebr2d( ctxt, ROW, top, Amp, K, Mptr( XAPTR, 0,
427  IJXA, XAld, TYPE->size ), XAld, myrow,
428  Xcol );
429  }
430  }
431  }
432  else
433  {
434 /*
435 * Finally, sub( X ) is not replicated and A spans only one process column.
436 * There is no need to broadcast, a send/recv is sufficient.
437 */
438  if( mycol == Xcol )
439  {
440  if( Amp > 0 )
441  {
442  Xld = DESCX[LLD_];
443  TYPE->Cgesd2d( ctxt, Amp, K, Mptr( X, Xii, Xjj, Xld,
444  TYPE->size ), Xld, myrow, Acol );
445  }
446  }
447  else if( mycol == Acol )
448  {
449  if( Amp > 0 )
450  {
451  XAld = DXA[LLD_];
452  TYPE->Cgerv2d( ctxt, Amp, K, Mptr( XAPTR, 0, IJXA, XAld,
453  TYPE->size ), XAld, myrow, Xcol );
454  }
455  }
456  }
457  return;
458  }
459  }
460 /*
461 * sub( X ) cannot be reused, too bad ... redistribute
462 */
463  if( XisRow )
464  {
465  PB_Cpaxpby( TYPE, CONJUG, K, M, TYPE->one, X, IX, JX, DESCX, XROC,
466  TYPE->zero, XAPTR, 0, IJXA, DXA, COLUMN );
467  }
468  else
469  {
470  PB_Cpaxpby( TYPE, CONJUG, M, K, TYPE->one, X, IX, JX, DESCX, XROC,
471  TYPE->zero, XAPTR, 0, IJXA, DXA, COLUMN );
472  }
473  }
474 /*
475 * End of PB_CInV2
476 */
477 }
TYPE
#define TYPE
Definition: clamov.c:7
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
PB_Cpaxpby
void PB_Cpaxpby()
PB_CInV2
void PB_CInV2(PBTYP_T *TYPE, char *CONJUG, char *ROWCOL, int M, int N, int *DESCA, int K, char *X, int IX, int JX, int *DESCX, char *XROC, char *XAPTR, int IJXA, int *DXA)
Definition: PB_CInV2.c:25
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
CSRC_
#define CSRC_
Definition: PBtools.h:46
Mspan
#define Mspan(n_, i_, inb_, nb_, srcproc_, nprocs_)
Definition: PBtools.h:160
Mnumroc
#define Mnumroc(np_, n_, i_, inb_, nb_, proc_, srcproc_, nprocs_)
Definition: PBtools.h:222
LLD_
#define LLD_
Definition: PBtools.h:47
CNOCONJG
#define CNOCONJG
Definition: PBblas.h:19
CROW
#define CROW
Definition: PBblacs.h:21
IMB_
#define IMB_
Definition: PBtools.h:41
Minfog2l
#define Minfog2l(i_, j_, desc_, nr_, nc_, r_, c_, ii_, jj_, pr_, pc_)
Definition: PBtools.h:428
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
Mfirstnb
#define Mfirstnb(inbt_, n_, i_, inb_, nb_)
Definition: PBtools.h:139
PB_Ctop
char * PB_Ctop()
RSRC_
#define RSRC_
Definition: PBtools.h:45
BCAST
#define BCAST
Definition: PBblacs.h:48
INB_
#define INB_
Definition: PBtools.h:42
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
Mupcase
#define Mupcase(C)
Definition: PBtools.h:83
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
CTXT_
#define CTXT_
Definition: PBtools.h:38