ScaLAPACK 2.1  2.1
ScaLAPACK: Scalable Linear Algebra PACKage
pcdotu_.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "pblas.h"
14 #include "PBpblas.h"
15 #include "PBtools.h"
16 #include "PBblacs.h"
17 #include "PBblas.h"
18 
19 #ifdef __STDC__
20 void pcdotu_( int * N,
21  float * DOT,
22  float * X, int * IX, int * JX, int * DESCX, int * INCX,
23  float * Y, int * IY, int * JY, int * DESCY, int * INCY )
24 #else
25 void pcdotu_( N, DOT, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY )
26 /*
27 * .. Scalar Arguments ..
28 */
29  int * INCX, * INCY, * IX, * IY, * JX, * JY, * N;
30  float * DOT;
31 /*
32 * .. Array Arguments ..
33 */
34  int * DESCX, * DESCY;
35  float * X, * Y;
36 #endif
37 {
38 /*
39 * Purpose
40 * =======
41 *
42 * PCDOTU forms the dot product of two subvectors,
43 *
44 * DOT := sub( X )**T * sub( Y ),
45 *
46 * where
47 *
48 * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X,
49 * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and,
50 *
51 * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y,
52 * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y.
53 *
54 * Notes
55 * =====
56 *
57 * A description vector is associated with each 2D block-cyclicly dis-
58 * tributed matrix. This vector stores the information required to
59 * establish the mapping between a matrix entry and its corresponding
60 * process and memory location.
61 *
62 * In the following comments, the character _ should be read as
63 * "of the distributed matrix". Let A be a generic term for any 2D
64 * block cyclicly distributed matrix. Its description vector is DESC_A:
65 *
66 * NOTATION STORED IN EXPLANATION
67 * ---------------- --------------- ------------------------------------
68 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
69 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
70 * the NPROW x NPCOL BLACS process grid
71 * A is distributed over. The context
72 * itself is global, but the handle
73 * (the integer value) may vary.
74 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
75 * ted matrix A, M_A >= 0.
76 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
77 * buted matrix A, N_A >= 0.
78 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
79 * block of the matrix A, IMB_A > 0.
80 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
81 * left block of the matrix A,
82 * INB_A > 0.
83 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
84 * bute the last M_A-IMB_A rows of A,
85 * MB_A > 0.
86 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
87 * bute the last N_A-INB_A columns of
88 * A, NB_A > 0.
89 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
90 * row of the matrix A is distributed,
91 * NPROW > RSRC_A >= 0.
92 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
93 * first column of A is distributed.
94 * NPCOL > CSRC_A >= 0.
95 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
96 * array storing the local blocks of
97 * the distributed matrix A,
98 * IF( Lc( 1, N_A ) > 0 )
99 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
100 * ELSE
101 * LLD_A >= 1.
102 *
103 * Let K be the number of rows of a matrix A starting at the global in-
104 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
105 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
106 * receive if these K rows were distributed over NPROW processes. If K
107 * is the number of columns of a matrix A starting at the global index
108 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
109 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
110 * these K columns were distributed over NPCOL processes.
111 *
112 * The values of Lr() and Lc() may be determined via a call to the func-
113 * tion PB_Cnumroc:
114 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
115 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
116 *
117 * Arguments
118 * =========
119 *
120 * N (global input) INTEGER
121 * On entry, N specifies the length of the subvectors to be
122 * multiplied. N must be at least zero.
123 *
124 * DOT (local output) COMPLEX array
125 * On exit, DOT specifies the dot product of the two subvectors
126 * sub( X ) and sub( Y ) only in their scope (See below for fur-
127 * ther details).
128 *
129 * X (local input) COMPLEX array
130 * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X
131 * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and
132 * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least
133 * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise.
134 * Before entry, this array contains the local entries of the
135 * matrix X.
136 *
137 * IX (global input) INTEGER
138 * On entry, IX specifies X's global row index, which points to
139 * the beginning of the submatrix sub( X ).
140 *
141 * JX (global input) INTEGER
142 * On entry, JX specifies X's global column index, which points
143 * to the beginning of the submatrix sub( X ).
144 *
145 * DESCX (global and local input) INTEGER array
146 * On entry, DESCX is an integer array of dimension DLEN_. This
147 * is the array descriptor for the matrix X.
148 *
149 * INCX (global input) INTEGER
150 * On entry, INCX specifies the global increment for the
151 * elements of X. Only two values of INCX are supported in
152 * this version, namely 1 and M_X. INCX must not be zero.
153 *
154 * Y (local input) COMPLEX array
155 * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y
156 * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and
157 * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least
158 * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise.
159 * Before entry, this array contains the local entries of the
160 * matrix Y.
161 *
162 * IY (global input) INTEGER
163 * On entry, IY specifies Y's global row index, which points to
164 * the beginning of the submatrix sub( Y ).
165 *
166 * JY (global input) INTEGER
167 * On entry, JY specifies Y's global column index, which points
168 * to the beginning of the submatrix sub( Y ).
169 *
170 * DESCY (global and local input) INTEGER array
171 * On entry, DESCY is an integer array of dimension DLEN_. This
172 * is the array descriptor for the matrix Y.
173 *
174 * INCY (global input) INTEGER
175 * On entry, INCY specifies the global increment for the
176 * elements of Y. Only two values of INCY are supported in
177 * this version, namely 1 and M_Y. INCY must not be zero.
178 *
179 * Further Details
180 * ===============
181 *
182 * When the result of a vector-oriented PBLAS call is a scalar, this
183 * scalar is set only within the process scope which owns the vector(s)
184 * being operated on. Let sub( X ) be a generic term for the input vec-
185 * tor(s). Then, the processes owning the correct the answer is determi-
186 * ned as follows: if an operation involves more than one vector, the
187 * processes receiving the result will be the union of the following set
188 * of processes for each vector:
189 *
190 * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro-
191 * cess row or process column owns the vector operand, therefore only
192 * the process owning sub( X ) receives the correct result;
193 *
194 * If INCX = M_X, then sub( X ) is a vector distributed over a process
195 * row. Each process in this row receives the result;
196 *
197 * If INCX = 1, then sub( X ) is a vector distributed over a process
198 * column. Each process in this column receives the result;
199 *
200 * -- Written on April 1, 1998 by
201 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
202 *
203 * ---------------------------------------------------------------------
204 */
205 /*
206 * .. Local Scalars ..
207 */
208  char scope, * top;
209  int OneBlock, OneDgrid, RRorCC, Square, Xcol, Xi, Xii, XinbD,
210  Xinb1D, XisD, XisR, XisRow, Xj, Xjj, Xld, Xlinc, XmyprocD,
211  XmyprocR, XnbD, XnpD, XnprocsD, XnprocsR, XprocD, XprocR,
212  Xrow, Ycol, Yi, Yii, YinbD, Yinb1D, YisD, YisR, YisRow, Yj,
213  Yjj, Yld, Ylinc, YmyprocD, YmyprocR, YnbD, YnpD, YnprocsD,
214  YnprocsR, YprocD, YprocR, Yrow, cdst, csrc, ctxt, dst, info,
215  ione=1, mycol, myrow, npcol, nprow, rdst, rsrc, size, src;
216  PBTYP_T * type;
217  VVDOT_T dot;
218 /*
219 * .. Local Arrays ..
220 */
221  char * buf = NULL;
222  int Xd[DLEN_], Yd[DLEN_], dbuf[ DLEN_ ];
223 /* ..
224 * .. Executable Statements ..
225 *
226 */
227  PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd );
228  PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd );
229 #ifndef NO_ARGCHK
230 /*
231 * Test the input parameters
232 */
233  Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
234  if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) )
235  {
236  PB_Cchkvec( ctxt, "PCDOTU", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info );
237  PB_Cchkvec( ctxt, "PCDOTU", "Y", *N, 1, Yi, Yj, Yd, *INCY, 11, &info );
238  }
239  if( info ) { PB_Cabort( ctxt, "PCDOTU", info ); return; }
240 #endif
241  DOT[REAL_PART] = ZERO;
242  DOT[IMAG_PART] = ZERO;
243 /*
244 * Quick return if possible
245 */
246  if( *N == 0 ) return;
247 /*
248 * Handle degenerate case
249 */
250  if( ( *N == 1 ) && ( ( Xd[ M_ ] == 1 ) || ( Yd[ M_ ] == 1 ) ) )
251  {
252  type = PB_Cctypeset();
253  PB_Cpdot11( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX,
254  ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu );
255  return;
256  }
257 /*
258 * Start the operations
259 */
260 #ifdef NO_ARGCHK
261  Cblacs_gridinfo( ( ctxt = Xd[ CTXT_ ] ), &nprow, &npcol, &myrow, &mycol );
262 #endif
263 /*
264 * Determine if sub( X ) is distributed or not
265 */
266  if( ( XisRow = ( *INCX == Xd[M_] ) ) != 0 )
267  XisD = ( ( Xd[CSRC_] >= 0 ) && ( ( XnprocsD = npcol ) > 1 ) );
268  else
269  XisD = ( ( Xd[RSRC_] >= 0 ) && ( ( XnprocsD = nprow ) > 1 ) );
270 /*
271 * Determine if sub( Y ) is distributed or not
272 */
273  if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 )
274  YisD = ( ( Yd[CSRC_] >= 0 ) && ( ( YnprocsD = npcol ) > 1 ) );
275  else
276  YisD = ( ( Yd[RSRC_] >= 0 ) && ( ( YnprocsD = nprow ) > 1 ) );
277 /*
278 * Are sub( X ) and sub( Y ) both row or column vectors ?
279 */
280  RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) );
281 /*
282 * XisD && YisD <=> both vector operands are indeed distributed
283 */
284  if( XisD && YisD )
285  {
286 /*
287 * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol
288 */
289  PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj,
290  &Xrow, &Xcol );
291  if( XisRow )
292  {
293  XinbD = Xd[INB_]; XnbD = Xd[NB_];
294  Xld = Xd[LLD_]; Xlinc = Xld;
295  XprocD = Xcol; XmyprocD = mycol;
296  XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow;
297  XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) );
298  Mfirstnb( Xinb1D, *N, Xj, XinbD, XnbD );
299  }
300  else
301  {
302  XinbD = Xd[IMB_]; XnbD = Xd[MB_];
303  Xld = Xd[LLD_]; Xlinc = 1;
304  XprocD = Xrow; XmyprocD = myrow;
305  XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol;
306  XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) );
307  Mfirstnb( Xinb1D, *N, Xi, XinbD, XnbD );
308  }
309 /*
310 * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol
311 */
312  PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj,
313  &Yrow, &Ycol );
314  if( YisRow )
315  {
316  YinbD = Yd[INB_]; YnbD = Yd[NB_];
317  Yld = Yd[LLD_]; Ylinc = Yld;
318  YprocD = Ycol; YmyprocD = mycol;
319  YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow;
320  YisR = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) );
321  Mfirstnb( Yinb1D, *N, Yj, YinbD, YnbD );
322  }
323  else
324  {
325  YinbD = Yd[IMB_]; YnbD = Yd[MB_];
326  Yld = Yd[LLD_]; Ylinc = 1;
327  YprocD = Yrow; YmyprocD = myrow;
328  YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol;
329  YisR = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) );
330  Mfirstnb( Yinb1D, *N, Yi, YinbD, YnbD );
331  }
332 /*
333 * Do sub( X ) and sub( Y ) span more than one process ?
334 */
335  OneDgrid = ( ( XnprocsD == 1 ) && ( YnprocsD == 1 ) );
336  OneBlock = ( ( Xinb1D >= *N ) && ( Yinb1D >= *N ) );
337 /*
338 * Are sub( X ) and sub( Y ) distributed in the same manner ?
339 */
340  Square = ( ( Xinb1D == Yinb1D ) && ( XnbD == YnbD ) &&
341  ( XnprocsD == YnprocsD ) );
342 
343  if( !( XisR ) )
344  {
345 /*
346 * sub( X ) is not replicated
347 */
348  if( YisR )
349  {
350 /*
351 * If sub( X ) is not replicated, but sub( Y ) is, a process row or column
352 * YprocR need to be selected. It will contain the non-replicated vector used
353 * to perform the dot product computation.
354 */
355  if( RRorCC )
356  {
357 /*
358 * sub( X ) and sub( Y ) are both row or column vectors
359 */
360  if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) )
361  {
362 /*
363 * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD.
364 * Enforce a purely local operation by choosing YprocR to be equal to XprocR.
365 */
366  YprocR = XprocR;
367  }
368  else
369  {
370 /*
371 * Otherwise, communication has to occur, so choose the next process row or
372 * column for YprocR to maximize the number of links, i.e reduce contention.
373 */
374  YprocR = MModAdd1( XprocR, XnprocsR );
375  }
376  }
377  else
378  {
379 /*
380 * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is
381 * chosen for YprocR does not really matter. Select the process origin.
382 */
383  YprocR = XprocD;
384  }
385  }
386  else
387  {
388 /*
389 * Neither sub( X ) nor sub( Y ) are replicated. If I am not in process row or
390 * column XprocR and not in process row or column YprocR, then quick return.
391 */
392  if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) )
393  return;
394  }
395  }
396  else
397  {
398 /*
399 * sub( X ) is distributed and replicated (so no quick return possible)
400 */
401  if( YisR )
402  {
403 /*
404 * sub( Y ) is distributed and replicated as well
405 */
406  if( RRorCC )
407  {
408 /*
409 * sub( X ) and sub( Y ) are both row or column vectors
410 */
411  if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) )
412  {
413 /*
414 * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD.
415 * Enforce a purely local operation by choosing XprocR and YprocR to be equal
416 * to zero.
417 */
418  XprocR = YprocR = 0;
419  }
420  else
421  {
422 /*
423 * Otherwise, communication has to occur, so select YprocR to be zero and the
424 * next process row or column for XprocR in order to maximize the number of
425 * used links, i.e reduce contention.
426 */
427  YprocR = 0;
428  XprocR = MModAdd1( YprocR, YnprocsR );
429  }
430  }
431  else
432  {
433 /*
434 * sub( X ) and sub( Y ) are distributed in orthogonal directions, select the
435 * origin processes.
436 */
437  XprocR = YprocD;
438  YprocR = XprocD;
439  }
440  }
441  else
442  {
443 /*
444 * sub( Y ) is distributed, but not replicated
445 */
446  if( RRorCC )
447  {
448 /*
449 * sub( X ) and sub( Y ) are both row or column vectors
450 */
451  if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) )
452  {
453 /*
454 * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD.
455 * Enforce a purely local operation by choosing XprocR to be equal to YprocR.
456 */
457  XprocR = YprocR;
458  }
459  else
460  {
461 /*
462 * Otherwise, communication has to occur, so choose the next process row or
463 * column for XprocR to maximize the number of links, i.e reduce contention.
464 */
465  XprocR = MModAdd1( YprocR, YnprocsR );
466  }
467  }
468  else
469  {
470 /*
471 * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is
472 * chosen for XprocR does not really matter. Select the origin process.
473 */
474  XprocR = YprocD;
475  }
476  }
477  }
478 /*
479 * Even if sub( X ) and/or sub( Y ) are replicated, only two process row or
480 * column are active, namely XprocR and YprocR. If any of those operands is
481 * replicated, broadcast will occur (unless there is an easy way out).
482 */
483  type = PB_Cctypeset(); size = type->size; dot = type->Fvvdotu;
484 /*
485 * A purely operation occurs iff the operands start in the same process and if
486 * either the grid is mono-dimensional or there is a single local block to be
487 * operated with or if both operands are aligned.
488 */
489  if( ( ( RRorCC && ( XprocD == YprocD ) && ( XprocR == YprocR ) ) ||
490  ( !( RRorCC ) && ( XprocD == YprocR ) && ( XprocR == YprocD ) ) ) &&
491  ( OneDgrid || OneBlock || ( RRorCC && Square ) ) )
492  {
493  if( ( !XisR && ( XmyprocR == XprocR ) &&
494  !YisR && ( YmyprocR == YprocR ) ) ||
495  ( !XisR && YisR && ( YmyprocR == YprocR ) ) ||
496  ( !YisR && XisR && ( XmyprocR == XprocR ) ) ||
497  ( XisR && YisR ) )
498  {
499  XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD,
500  XnprocsD );
501  YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD,
502  YnprocsD );
503  if( ( XnpD > 0 ) && ( YnpD > 0 ) )
504  {
505  dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld,
506  size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ),
507  &Ylinc );
508  }
509  }
510 /*
511 * Combine the local results in sub( X )'s scope
512 */
513  if( ( XisR && YisR ) || ( XmyprocR == XprocR ) )
514  {
515  scope = ( XisRow ? CROW : CCOLUMN );
516  top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET );
517  Ccgsum2d( ctxt, &scope, top, 1, 1, ((char *) DOT), 1, -1, 0 );
518  }
519  if( RRorCC && XisR && YisR ) return;
520  }
521  else if( ( RRorCC && OneDgrid ) || OneBlock || Square )
522  {
523 /*
524 * Otherwise, it may be possible to compute the desired dot-product in a single
525 * message exchange iff the grid is mono-dimensional and the operands are
526 * distributed in the same direction, or there is just one block to be exchanged
527 * or if both operands are similarly distributed in their respective direction.
528 */
529  if( ( YmyprocR == YprocR ) )
530  {
531 /*
532 * The processes owning a piece of sub( Y ) send it to the corresponding
533 * process owning s piece of sub ( X ).
534 */
535  YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD,
536  YnprocsD );
537  if( YnpD > 0 )
538  {
539  dst = XprocD + MModSub( YmyprocD, YprocD, YnprocsD );
540  dst = MPosMod( dst, XnprocsD );
541  if( XisRow ) { rdst = XprocR; cdst = dst; }
542  else { rdst = dst; cdst = XprocR; }
543 
544  if( ( myrow == rdst ) && ( mycol == cdst ) )
545  {
546  dot( &YnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld,
547  size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld,
548  size ), &Ylinc );
549  }
550  else
551  {
552  if( YisRow )
553  Ccgesd2d( ctxt, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj,
554  Yld, size ), Yld, rdst, cdst );
555  else
556  Ccgesd2d( ctxt, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj,
557  Yld, size ), Yld, rdst, cdst );
558  }
559  }
560  }
561  if( XmyprocR == XprocR )
562  {
563 /*
564 * The processes owning a piece of sub( X ) receive the corresponding local
565 * piece of sub( Y ), compute the local dot product and combine the results
566 * within sub( X )'s scope.
567 */
568  XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD,
569  XnprocsD );
570  if( XnpD > 0 )
571  {
572  src = YprocD + MModSub( XmyprocD, XprocD, XnprocsD );
573  src = MPosMod( src, YnprocsD );
574  if( YisRow ) { rsrc = YprocR; csrc = src; }
575  else { rsrc = src; csrc = YprocR; }
576  if( ( myrow != rsrc ) || ( mycol != csrc ) )
577  {
578  buf = PB_Cmalloc( XnpD * size );
579  if( YisRow )
580  Ccgerv2d( ctxt, 1, XnpD, buf, 1, rsrc, csrc );
581  else
582  Ccgerv2d( ctxt, XnpD, 1, buf, XnpD, rsrc, csrc );
583  dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld,
584  size ), &Xlinc, buf, &ione );
585  if( buf ) free( buf );
586  }
587  }
588  if( XisRow )
589  {
590  top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
591  Ccgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 );
592  }
593  else
594  {
595  top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
596  Ccgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 );
597  }
598  }
599  }
600  else
601  {
602 /*
603 * General case, copy sub( Y ) within sub( X )'s scope, compute the local
604 * results and combine them within sub( X )'s scope.
605 */
606  XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD );
607 
608  if( XisRow )
609  {
610  PB_Cdescset( dbuf, 1, *N, 1, Xinb1D, 1, XnbD, XprocR, XprocD, ctxt,
611  1 );
612  }
613  else
614  {
615  PB_Cdescset( dbuf, *N, 1, Xinb1D, 1, XnbD, 1, XprocD, XprocR, ctxt,
616  MAX( 1, XnpD ) );
617  }
618  if( ( XmyprocR == XprocR ) && ( XnpD > 0 ) )
619  buf = PB_Cmalloc( XnpD * size );
620 
621  if( YisRow )
622  {
623  PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, ((char *) Y), Yi, Yj,
624  Yd, ROW, type->zero, buf, 0, 0, dbuf, ( XisRow ? ROW :
625  COLUMN ) );
626  }
627  else
628  {
629  PB_Cpaxpby( type, NOCONJG, *N, 1, type->one, ((char *) Y), Yi, Yj,
630  Yd, COLUMN, type->zero, buf, 0, 0, dbuf, ( XisRow ?
631  ROW : COLUMN ) );
632  }
633 
634  if( XmyprocR == XprocR )
635  {
636  if( XnpD > 0 )
637  {
638  dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld,
639  size ), &Xlinc, buf, &ione );
640  if( buf ) free( buf );
641  }
642  if( XisRow )
643  {
644  top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
645  Ccgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 );
646  }
647  else
648  {
649  top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
650  Ccgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 );
651  }
652  }
653  }
654 /*
655 * Send the DOT product result within sub( Y )'s scope
656 */
657  if( XisR || YisR )
658  {
659 /*
660 * Either sub( X ) or sub( Y ) are replicated, so that every process should have
661 * the result -> broadcast it orthogonally from sub( X )'s direction.
662 */
663  if( XisRow )
664  {
665  top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
666  if( XmyprocR == XprocR )
667  Ccgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 );
668  else
669  Ccgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, XprocR,
670  XmyprocD );
671  }
672  else
673  {
674  top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
675  if( XmyprocR == XprocR )
676  Ccgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 );
677  else
678  Ccgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, XmyprocD,
679  XprocR );
680  }
681  }
682  else
683  {
684 /*
685 * Neither sub( X ) nor sub( Y ) are replicated
686 */
687  if( RRorCC )
688  {
689 /*
690 * Both sub( X ) are distributed in the same direction -> the process row or
691 * column XprocR sends the result to the process row or column YprocR.
692 */
693  if( XprocR != YprocR )
694  {
695  if( XmyprocR == XprocR )
696  {
697  if( XisRow )
698  Ccgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YprocR,
699  YmyprocD );
700  else
701  Ccgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YmyprocD,
702  YprocR );
703  }
704  else if( YmyprocR == YprocR )
705  {
706  if( XisRow )
707  Ccgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XprocR,
708  XmyprocD );
709  else
710  Ccgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XmyprocD,
711  XprocR );
712  }
713  }
714  }
715  else
716  {
717 /*
718 * Otherwise, the process at the intersection of sub( X )'s and sub( Y )'s
719 * scope, broadcast the result within sub( Y )'s scope.
720 */
721  if( YmyprocR == YprocR )
722  {
723  if( YisRow )
724  {
725  top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
726  if( YmyprocD == XprocR )
727  Ccgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 );
728  else
729  Ccgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1,
730  YprocR, XprocR );
731  }
732  else
733  {
734  top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
735  if( YmyprocD == XprocR )
736  Ccgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 );
737  else
738  Ccgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1,
739  XprocR, YprocR );
740  }
741  }
742  }
743  }
744  }
745  else if( !( XisD ) && YisD )
746  {
747 /*
748 * sub( X ) is not distributed and sub( Y ) is distributed.
749 */
750  type = PB_Cctypeset();
751  PB_CpdotND( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX,
752  ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu );
753  }
754  else if( XisD && !( YisD ) )
755  {
756 /*
757 * sub( X ) is distributed and sub( Y ) is not distributed.
758 */
759  type = PB_Cctypeset();
760  PB_CpdotND( type, *N, ((char *) DOT), ((char *) Y), Yi, Yj, Yd, *INCY,
761  ((char *) X), Xi, Xj, Xd, *INCX, type->Fvvdotu );
762  }
763  else
764  {
765 /*
766 * Neither sub( X ) nor sub( Y ) are distributed
767 */
768  type = PB_Cctypeset();
769  PB_CpdotNN( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX,
770  ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu );
771  }
772 /*
773 * End of PCDOTU
774 */
775 }
M_
#define M_
Definition: PBtools.h:39
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
PB_Cpaxpby
void PB_Cpaxpby()
Ccgsum2d
void Ccgsum2d()
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
CSRC_
#define CSRC_
Definition: PBtools.h:46
PB_CpdotND
void PB_CpdotND()
PBblacs.h
PBtools.h
PBblas.h
NOCONJG
#define NOCONJG
Definition: PBblas.h:45
REAL_PART
#define REAL_PART
Definition: pblas.h:135
PBpblas.h
DLEN_
#define DLEN_
Definition: PBtools.h:48
MPosMod
#define MPosMod(I, d)
Definition: PBtools.h:95
pcdotu_
void pcdotu_(int *N, float *DOT, float *X, int *IX, int *JX, int *DESCX, int *INCX, float *Y, int *IY, int *JY, int *DESCY, int *INCY)
Definition: pcdotu_.c:25
LLD_
#define LLD_
Definition: PBtools.h:47
CROW
#define CROW
Definition: PBblacs.h:21
ZERO
#define ZERO
Definition: PBtools.h:66
PB_Cchkvec
void PB_Cchkvec()
IMB_
#define IMB_
Definition: PBtools.h:41
MModSub
#define MModSub(I1, I2, d)
Definition: PBtools.h:102
PB_Cdescset
void PB_Cdescset()
Ccgebs2d
void Ccgebs2d()
PB_Cabort
void PB_Cabort()
MModAdd1
#define MModAdd1(I, d)
Definition: PBtools.h:100
PB_CpdotNN
void PB_CpdotNN()
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
Mfirstnb
#define Mfirstnb(inbt_, n_, i_, inb_, nb_)
Definition: PBtools.h:139
PB_Ctop
char * PB_Ctop()
RSRC_
#define RSRC_
Definition: PBtools.h:45
Ccgerv2d
void Ccgerv2d()
PBTYP_T::one
char * one
Definition: pblas.h:331
PB_Cpdot11
void PB_Cpdot11()
PB_CargFtoC
void PB_CargFtoC()
BCAST
#define BCAST
Definition: PBblacs.h:48
VVDOT_T
F_VOID_FCT(* VVDOT_T)()
Definition: pblas.h:286
COMBINE
#define COMBINE
Definition: PBblacs.h:49
PBTYP_T::size
int size
Definition: pblas.h:329
PB_Cinfog2l
void PB_Cinfog2l()
PB_Cnumroc
int PB_Cnumroc()
PB_Cmalloc
char * PB_Cmalloc()
CCOLUMN
#define CCOLUMN
Definition: PBblacs.h:20
INB_
#define INB_
Definition: PBtools.h:42
Ccgebr2d
void Ccgebr2d()
PBTYP_T::Fvvdotu
VVDOT_T Fvvdotu
Definition: pblas.h:353
MAX
#define MAX(a_, b_)
Definition: PBtools.h:77
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
pblas.h
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
CTXT_
#define CTXT_
Definition: PBtools.h:38
IMAG_PART
#define IMAG_PART
Definition: pblas.h:136
PB_Cctypeset
PBTYP_T * PB_Cctypeset()
Definition: PB_Cctypeset.c:19
PBTYP_T::zero
char * zero
Definition: pblas.h:331
Ccgesd2d
void Ccgesd2d()