ScaLAPACK 2.1  2.1
ScaLAPACK: Scalable Linear Algebra PACKage
PB_CpdotNN.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS auxiliary routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "../pblas.h"
14 #include "../PBpblas.h"
15 #include "../PBtools.h"
16 #include "../PBblacs.h"
17 #include "../PBblas.h"
18 
19 #ifdef __STDC__
20 void PB_CpdotNN( PBTYP_T * TYPE, int N, char * DOT,
21  char * X, int IX, int JX, int * DESCX, int INCX,
22  char * Y, int IY, int JY, int * DESCY, int INCY,
23  VVDOT_T FDOT )
24 #else
25 void PB_CpdotNN( TYPE, N, DOT, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY,
26  INCY, FDOT )
27 /*
28 * .. Scalar Arguments ..
29 */
30  int INCX, INCY, IX, IY, JX, JY, N;
31  char * DOT;
32  PBTYP_T * TYPE;
33  VVDOT_T FDOT;
34 /*
35 * .. Array Arguments ..
36 */
37  int * DESCX, * DESCY;
38  char * X, * Y;
39 #endif
40 {
41 /*
42 * Purpose
43 * =======
44 *
45 * PB_CpdotNN forms the dot product of two subvectors,
46 *
47 * DOT := sub( X )**T * sub( Y ) or DOT := sub( X )**H * sub( Y ),
48 *
49 * where
50 *
51 * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X,
52 * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and,
53 *
54 * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y,
55 * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y.
56 *
57 * Both subvectors are assumed to be not distributed.
58 *
59 * Notes
60 * =====
61 *
62 * A description vector is associated with each 2D block-cyclicly dis-
63 * tributed matrix. This vector stores the information required to
64 * establish the mapping between a matrix entry and its corresponding
65 * process and memory location.
66 *
67 * In the following comments, the character _ should be read as
68 * "of the distributed matrix". Let A be a generic term for any 2D
69 * block cyclicly distributed matrix. Its description vector is DESC_A:
70 *
71 * NOTATION STORED IN EXPLANATION
72 * ---------------- --------------- ------------------------------------
73 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
74 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
75 * the NPROW x NPCOL BLACS process grid
76 * A is distributed over. The context
77 * itself is global, but the handle
78 * (the integer value) may vary.
79 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
80 * ted matrix A, M_A >= 0.
81 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
82 * buted matrix A, N_A >= 0.
83 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
84 * block of the matrix A, IMB_A > 0.
85 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
86 * left block of the matrix A,
87 * INB_A > 0.
88 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
89 * bute the last M_A-IMB_A rows of A,
90 * MB_A > 0.
91 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
92 * bute the last N_A-INB_A columns of
93 * A, NB_A > 0.
94 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
95 * row of the matrix A is distributed,
96 * NPROW > RSRC_A >= 0.
97 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
98 * first column of A is distributed.
99 * NPCOL > CSRC_A >= 0.
100 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
101 * array storing the local blocks of
102 * the distributed matrix A,
103 * IF( Lc( 1, N_A ) > 0 )
104 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
105 * ELSE
106 * LLD_A >= 1.
107 *
108 * Let K be the number of rows of a matrix A starting at the global in-
109 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
110 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
111 * receive if these K rows were distributed over NPROW processes. If K
112 * is the number of columns of a matrix A starting at the global index
113 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
114 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
115 * these K columns were distributed over NPCOL processes.
116 *
117 * The values of Lr() and Lc() may be determined via a call to the func-
118 * tion PB_Cnumroc:
119 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
120 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
121 *
122 * Arguments
123 * =========
124 *
125 * TYPE (local input) pointer to a PBTYP_T structure
126 * On entry, TYPE is a pointer to a structure of type PBTYP_T,
127 * that contains type information (See pblas.h).
128 *
129 * N (global input) INTEGER
130 * On entry, N specifies the length of the subvectors to be
131 * multiplied. N must be at least zero.
132 *
133 * DOT (local output) pointer to CHAR
134 * On exit, DOT specifies the dot product of the two subvectors
135 * sub( X ) and sub( Y ) only in their scope (See below for fur-
136 * ther details).
137 *
138 * X (local input) pointer to CHAR
139 * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X
140 * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and
141 * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least
142 * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise.
143 * Before entry, this array contains the local entries of the
144 * matrix X.
145 *
146 * IX (global input) INTEGER
147 * On entry, IX specifies X's global row index, which points to
148 * the beginning of the submatrix sub( X ).
149 *
150 * JX (global input) INTEGER
151 * On entry, JX specifies X's global column index, which points
152 * to the beginning of the submatrix sub( X ).
153 *
154 * DESCX (global and local input) INTEGER array
155 * On entry, DESCX is an integer array of dimension DLEN_. This
156 * is the array descriptor for the matrix X.
157 *
158 * INCX (global input) INTEGER
159 * On entry, INCX specifies the global increment for the
160 * elements of X. Only two values of INCX are supported in
161 * this version, namely 1 and M_X. INCX must not be zero.
162 *
163 * Y (local input) pointer to CHAR
164 * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y
165 * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and
166 * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least
167 * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise.
168 * Before entry, this array contains the local entries of the
169 * matrix Y.
170 *
171 * IY (global input) INTEGER
172 * On entry, IY specifies Y's global row index, which points to
173 * the beginning of the submatrix sub( Y ).
174 *
175 * JY (global input) INTEGER
176 * On entry, JY specifies Y's global column index, which points
177 * to the beginning of the submatrix sub( Y ).
178 *
179 * DESCY (global and local input) INTEGER array
180 * On entry, DESCY is an integer array of dimension DLEN_. This
181 * is the array descriptor for the matrix Y.
182 *
183 * INCY (global input) INTEGER
184 * On entry, INCY specifies the global increment for the
185 * elements of Y. Only two values of INCY are supported in
186 * this version, namely 1 and M_Y. INCY must not be zero.
187 *
188 * FDOT (local input) pointer to a function of type VVDOT
189 * On entry, FDOT points to a subroutine that computes the local
190 * dot product of two vectors.
191 *
192 * Further Details
193 * ===============
194 *
195 * When the result of a vector-oriented PBLAS call is a scalar, this
196 * scalar is set only within the process scope which owns the vector(s)
197 * being operated on. Let sub( X ) be a generic term for the input vec-
198 * tor(s). Then, the processes owning the correct the answer is determi-
199 * ned as follows: if an operation involves more than one vector, the
200 * processes receiving the result will be the union of the following set
201 * of processes for each vector:
202 *
203 * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro-
204 * cess row or process column owns the vector operand, therefore only
205 * the process owning sub( X ) receives the correct result;
206 *
207 * If INCX = M_X, then sub( X ) is a vector distributed over a process
208 * row. Each process in this row receives the result;
209 *
210 * If INCX = 1, then sub( X ) is a vector distributed over a process
211 * column. Each process in this column receives the result;
212 *
213 * -- Written on April 1, 1998 by
214 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
215 *
216 * ---------------------------------------------------------------------
217 */
218 /*
219 * .. Local Scalars ..
220 */
221  char Xscope, Yscope, * top;
222  int RRorCC, Xcol, Xii, XisR, XisRow, Xjj, Xld, Xlinc, XmyprocD,
223  XmyprocR, XnprocsR, XprocR, Xrow, Ycol, Yii, YisR, YisRow,
224  Yjj, Yld, Ylinc, YmyprocD, YmyprocR, YnprocsR, YprocR, Yrow,
225  csrc, ctxt, ione=1, mycol, myrow, npcol, nprow, rsrc, size;
226 /*
227 * .. Local Arrays ..
228 */
229  char * buf = NULL;
230 /* ..
231 * .. Executable Statements ..
232 *
233 */
234 /*
235 * Retrieve process grid information
236 */
237  Cblacs_gridinfo( ( ctxt = DESCX[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
238 /*
239 * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol ...
240 */
241  PB_Cinfog2l( IX, JX, DESCX, nprow, npcol, myrow, mycol, &Xii, &Xjj,
242  &Xrow, &Xcol );
243  if( ( XisRow = ( INCX == DESCX[M_] ) ) != 0 )
244  {
245  Xld = DESCX[LLD_]; Xlinc = Xld;
246  XmyprocD = mycol; XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow;
247  XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) );
248  }
249  else
250  {
251  Xld = DESCX[LLD_]; Xlinc = 1;
252  XmyprocD = myrow; XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol;
253  XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) );
254  }
255 /*
256 * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol ...
257 */
258  PB_Cinfog2l( IY, JY, DESCY, nprow, npcol, myrow, mycol, &Yii, &Yjj,
259  &Yrow, &Ycol );
260  if( ( YisRow = ( INCY == DESCY[M_] ) ) != 0 )
261  {
262  Yld = DESCY[LLD_]; Ylinc = Yld;
263  YmyprocD = mycol; YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow;
264  YisR = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) );
265  }
266  else
267  {
268  Yld = DESCY[LLD_]; Ylinc = 1;
269  YmyprocD = myrow; YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol;
270  YisR = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) );
271  }
272 /*
273 * Are sub( X ) and sub( Y ) both row or column vectors ?
274 */
275  RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) );
276 /*
277 * Neither sub( X ) nor sub( Y ) are distributed
278 */
279  if( !XisR )
280  {
281 /*
282 * sub( X ) is not replicated
283 */
284  if( !( YisR ) )
285  {
286 /*
287 * sub( Y ) is not replicated
288 */
289  if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) )
290 /*
291 * If I am not in XprocR or YprocR, then return immediately
292 */
293  return;
294 
295  size = TYPE->size;
296 
297  if( RRorCC )
298  {
299 /*
300 * sub( X ) and sub( Y ) are both row or column vectors
301 */
302  if( XprocR == YprocR )
303  {
304 /*
305 * sub( X ) and sub( Y ) are in the same process row or column
306 */
307  FDOT( &N, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y,
308  Yii, Yjj, Yld, size ), &Ylinc );
309  }
310  else
311  {
312 /*
313 * sub( X ) and sub( Y ) are in a different process row or column
314 */
315  if( XmyprocR == XprocR )
316  {
317  buf = PB_Cmalloc( N * size );
318 /*
319 * Send sub( X ) to where sub( Y ) resides, and receive sub( Y ) from the same
320 * location.
321 */
322  if( XisRow )
323  {
324  TYPE->Cgesd2d( ctxt, 1, N, Mptr( X, Xii, Xjj, Xld, size ),
325  Xld, YprocR, XmyprocD );
326  TYPE->Cgerv2d( ctxt, 1, N, buf, 1, YprocR, XmyprocD );
327  }
328  else
329  {
330  TYPE->Cgesd2d( ctxt, N, 1, Mptr( X, Xii, Xjj, Xld, size ),
331  Xld, XmyprocD, YprocR );
332  TYPE->Cgerv2d( ctxt, N, 1, buf, N, XmyprocD, YprocR );
333  }
334  FDOT( &N, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, buf,
335  &ione );
336  if( buf ) free( buf );
337  }
338 
339  if( YmyprocR == YprocR )
340  {
341  buf = PB_Cmalloc( N * size );
342 /*
343 * Send sub( Y ) to where sub( X ) resides, and receive sub( X ) from the same
344 * location.
345 */
346  if( YisRow )
347  {
348  TYPE->Cgesd2d( ctxt, 1, N, Mptr( Y, Yii, Yjj, Yld, size ),
349  Yld, XprocR, YmyprocD );
350  TYPE->Cgerv2d( ctxt, 1, N, buf, 1, XprocR, YmyprocD );
351  }
352  else
353  {
354  TYPE->Cgesd2d( ctxt, N, 1, Mptr( Y, Yii, Yjj, Yld, size ),
355  Yld, YmyprocD, XprocR );
356  TYPE->Cgerv2d( ctxt, N, 1, buf, N, YmyprocD, XprocR );
357  }
358  FDOT( &N, DOT, buf, &ione, Mptr( Y, Yii, Yjj, Yld, size ),
359  &Ylinc );
360  if( buf ) free( buf );
361  }
362  }
363  }
364  else
365  {
366 /*
367 * sub( X ) and sub( Y ) are not both row or column vectors
368 */
369  if( ( XmyprocR == XprocR ) && ( YmyprocR == YprocR ) )
370  {
371 /*
372 * If I am at the intersection of the process row and column, then compute the
373 * dot product and broadcast it in my process row and column.
374 */
375  FDOT( &N, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y,
376  Yii, Yjj, Yld, size ), &Ylinc );
377  top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
378  TYPE->Cgebs2d( ctxt, ROW, top, 1, 1, DOT, 1 );
379  top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
380  TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 );
381  }
382  else if( XmyprocR == XprocR )
383  {
384  if( XisRow ) { Xscope = CROW; rsrc = XprocR; csrc = YprocR; }
385  else { Xscope = CCOLUMN; rsrc = YprocR; csrc = XprocR; }
386  top = PB_Ctop( &ctxt, BCAST, &Xscope, TOP_GET );
387  TYPE->Cgebr2d( ctxt, &Xscope, top, 1, 1, DOT, 1, rsrc, csrc );
388  }
389  else if( YmyprocR == YprocR )
390  {
391  if( YisRow ) { Yscope = CROW; rsrc = YprocR; csrc = XprocR; }
392  else { Yscope = CCOLUMN; rsrc = XprocR; csrc = YprocR; }
393  top = PB_Ctop( &ctxt, BCAST, &Yscope, TOP_GET );
394  TYPE->Cgebr2d( ctxt, &Yscope, top, 1, 1, DOT, 1, rsrc, csrc );
395  }
396  }
397  }
398  else
399  {
400 /*
401 * sub( Y ) is replicated
402 */
403  if( XmyprocR == XprocR )
404  {
405 /*
406 * If I am in the process row (resp. column) owning sub( X ), then compute the
407 * dot product and broadcast in my column (resp. row).
408 */
409  size = TYPE->size;
410 
411  FDOT( &N, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii,
412  Yjj, Yld, size ), &Ylinc );
413 
414  if( XisRow )
415  {
416  top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
417  TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 );
418  }
419  else
420  {
421  top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
422  TYPE->Cgebs2d( ctxt, ROW, top, 1, 1, DOT, 1 );
423  }
424  }
425  else
426  {
427 /*
428 * Otherwise, receive the dot product
429 */
430  if( XisRow )
431  {
432  top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
433  TYPE->Cgebr2d( ctxt, COLUMN, top, 1, 1, DOT, 1, XprocR,
434  XmyprocD );
435  }
436  else
437  {
438  top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
439  TYPE->Cgebr2d( ctxt, ROW, top, 1, 1, DOT, 1, XmyprocD,
440  XprocR );
441  }
442  }
443  }
444  }
445  else
446  {
447 /*
448 * sub( X ) is replicated
449 */
450  if( YisR || ( YmyprocR == YprocR ) )
451  {
452 /*
453 * If I own a piece of sub( Y ), then compute the dot product
454 */
455  size = TYPE->size;
456 
457  FDOT( &N, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii,
458  Yjj, Yld, size ), &Ylinc );
459  }
460 
461  if( !YisR )
462  {
463 /*
464 * If sub( Y ) is not replicated, then broadcast the result to the other
465 * processes that own a piece of sub( X ), but were not involved in the above
466 * dot-product computation.
467 */
468  if( YisRow )
469  {
470  top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
471  if( YmyprocR == YprocR )
472  TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 );
473  else
474  TYPE->Cgebr2d( ctxt, COLUMN, top, 1, 1, DOT, 1, YprocR,
475  YmyprocD );
476  }
477  else
478  {
479  top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
480  if( YmyprocR == YprocR )
481  TYPE->Cgebs2d( ctxt, ROW, top, 1, 1, DOT, 1 );
482  else
483  TYPE->Cgebr2d( ctxt, ROW, top, 1, 1, DOT, 1, YmyprocD, YprocR );
484  }
485  }
486  }
487 /*
488 * End of PB_CpdotNN
489 */
490 }
M_
#define M_
Definition: PBtools.h:39
TYPE
#define TYPE
Definition: clamov.c:7
ROW
#define ROW
Definition: PBblacs.h:46
COLUMN
#define COLUMN
Definition: PBblacs.h:45
PB_CpdotNN
void PB_CpdotNN(PBTYP_T *TYPE, int N, char *DOT, char *X, int IX, int JX, int *DESCX, int INCX, char *Y, int IY, int JY, int *DESCY, int INCY, VVDOT_T FDOT)
Definition: PB_CpdotNN.c:25
LLD_
#define LLD_
Definition: PBtools.h:47
CROW
#define CROW
Definition: PBblacs.h:21
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
PB_Ctop
char * PB_Ctop()
BCAST
#define BCAST
Definition: PBblacs.h:48
VVDOT_T
F_VOID_FCT(* VVDOT_T)()
Definition: pblas.h:286
PB_Cinfog2l
void PB_Cinfog2l()
PB_Cmalloc
char * PB_Cmalloc()
CCOLUMN
#define CCOLUMN
Definition: PBblacs.h:20
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
CTXT_
#define CTXT_
Definition: PBtools.h:38