SCALAPACK 2.2.2
LAPACK: Linear Algebra PACKage
Loading...
Searching...
No Matches
pcdotu_.c
Go to the documentation of this file.
1/* ---------------------------------------------------------------------
2*
3* -- PBLAS routine (version 2.0) --
4* University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5* and University of California, Berkeley.
6* April 1, 1998
7*
8* ---------------------------------------------------------------------
9*/
10/*
11* Include files
12*/
13#include "pblas.h"
14#include "PBpblas.h"
15#include "PBtools.h"
16#include "PBblacs.h"
17#include "PBblas.h"
18
19#ifdef __STDC__
20void pcdotu_( Int * N,
21 float * DOT,
22 float * X, Int * IX, Int * JX, Int * DESCX, Int * INCX,
23 float * Y, Int * IY, Int * JY, Int * DESCY, Int * INCY )
24#else
25void pcdotu_( N, DOT, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY )
26/*
27* .. Scalar Arguments ..
28*/
29 Int * INCX, * INCY, * IX, * IY, * JX, * JY, * N;
30 float * DOT;
31/*
32* .. Array Arguments ..
33*/
34 Int * DESCX, * DESCY;
35 float * X, * Y;
36#endif
37{
38/*
39* Purpose
40* =======
41*
42* PCDOTU forms the dot product of two subvectors,
43*
44* DOT := sub( X )**T * sub( Y ),
45*
46* where
47*
48* sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X,
49* X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and,
50*
51* sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y,
52* Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y.
53*
54* Notes
55* =====
56*
57* A description vector is associated with each 2D block-cyclicly dis-
58* tributed matrix. This vector stores the information required to
59* establish the mapping between a matrix entry and its corresponding
60* process and memory location.
61*
62* In the following comments, the character _ should be read as
63* "of the distributed matrix". Let A be a generic term for any 2D
64* block cyclicly distributed matrix. Its description vector is DESC_A:
65*
66* NOTATION STORED IN EXPLANATION
67* ---------------- --------------- ------------------------------------
68* DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
69* CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
70* the NPROW x NPCOL BLACS process grid
71* A is distributed over. The context
72* itself is global, but the handle
73* (the integer value) may vary.
74* M_A (global) DESCA[ M_ ] The number of rows in the distribu-
75* ted matrix A, M_A >= 0.
76* N_A (global) DESCA[ N_ ] The number of columns in the distri-
77* buted matrix A, N_A >= 0.
78* IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
79* block of the matrix A, IMB_A > 0.
80* INB_A (global) DESCA[ INB_ ] The number of columns of the upper
81* left block of the matrix A,
82* INB_A > 0.
83* MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
84* bute the last M_A-IMB_A rows of A,
85* MB_A > 0.
86* NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
87* bute the last N_A-INB_A columns of
88* A, NB_A > 0.
89* RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
90* row of the matrix A is distributed,
91* NPROW > RSRC_A >= 0.
92* CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
93* first column of A is distributed.
94* NPCOL > CSRC_A >= 0.
95* LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
96* array storing the local blocks of
97* the distributed matrix A,
98* IF( Lc( 1, N_A ) > 0 )
99* LLD_A >= MAX( 1, Lr( 1, M_A ) )
100* ELSE
101* LLD_A >= 1.
102*
103* Let K be the number of rows of a matrix A starting at the global in-
104* dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
105* that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
106* receive if these K rows were distributed over NPROW processes. If K
107* is the number of columns of a matrix A starting at the global index
108* JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
109* lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
110* these K columns were distributed over NPCOL processes.
111*
112* The values of Lr() and Lc() may be determined via a call to the func-
113* tion PB_Cnumroc:
114* Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
115* Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
116*
117* Arguments
118* =========
119*
120* N (global input) INTEGER
121* On entry, N specifies the length of the subvectors to be
122* multiplied. N must be at least zero.
123*
124* DOT (local output) COMPLEX array
125* On exit, DOT specifies the dot product of the two subvectors
126* sub( X ) and sub( Y ) only in their scope (See below for fur-
127* ther details).
128*
129* X (local input) COMPLEX array
130* On entry, X is an array of dimension (LLD_X, Kx), where LLD_X
131* is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and
132* MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least
133* Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise.
134* Before entry, this array contains the local entries of the
135* matrix X.
136*
137* IX (global input) INTEGER
138* On entry, IX specifies X's global row index, which points to
139* the beginning of the submatrix sub( X ).
140*
141* JX (global input) INTEGER
142* On entry, JX specifies X's global column index, which points
143* to the beginning of the submatrix sub( X ).
144*
145* DESCX (global and local input) INTEGER array
146* On entry, DESCX is an integer array of dimension DLEN_. This
147* is the array descriptor for the matrix X.
148*
149* INCX (global input) INTEGER
150* On entry, INCX specifies the global increment for the
151* elements of X. Only two values of INCX are supported in
152* this version, namely 1 and M_X. INCX must not be zero.
153*
154* Y (local input) COMPLEX array
155* On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y
156* is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and
157* MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least
158* Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise.
159* Before entry, this array contains the local entries of the
160* matrix Y.
161*
162* IY (global input) INTEGER
163* On entry, IY specifies Y's global row index, which points to
164* the beginning of the submatrix sub( Y ).
165*
166* JY (global input) INTEGER
167* On entry, JY specifies Y's global column index, which points
168* to the beginning of the submatrix sub( Y ).
169*
170* DESCY (global and local input) INTEGER array
171* On entry, DESCY is an integer array of dimension DLEN_. This
172* is the array descriptor for the matrix Y.
173*
174* INCY (global input) INTEGER
175* On entry, INCY specifies the global increment for the
176* elements of Y. Only two values of INCY are supported in
177* this version, namely 1 and M_Y. INCY must not be zero.
178*
179* Further Details
180* ===============
181*
182* When the result of a vector-oriented PBLAS call is a scalar, this
183* scalar is set only within the process scope which owns the vector(s)
184* being operated on. Let sub( X ) be a generic term for the input vec-
185* tor(s). Then, the processes owning the correct the answer is determi-
186* ned as follows: if an operation involves more than one vector, the
187* processes receiving the result will be the union of the following set
188* of processes for each vector:
189*
190* If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro-
191* cess row or process column owns the vector operand, therefore only
192* the process owning sub( X ) receives the correct result;
193*
194* If INCX = M_X, then sub( X ) is a vector distributed over a process
195* row. Each process in this row receives the result;
196*
197* If INCX = 1, then sub( X ) is a vector distributed over a process
198* column. Each process in this column receives the result;
199*
200* -- Written on April 1, 1998 by
201* Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
202*
203* ---------------------------------------------------------------------
204*/
205/*
206* .. Local Scalars ..
207*/
208 char scope, * top;
209 Int OneBlock, OneDgrid, RRorCC, Square, Xcol, Xi, Xii, XinbD,
210 Xinb1D, XisD, XisR, XisRow, Xj, Xjj, Xld, Xlinc, XmyprocD,
211 XmyprocR, XnbD, XnpD, XnprocsD, XnprocsR, XprocD, XprocR,
212 Xrow, Ycol, Yi, Yii, YinbD, Yinb1D, YisD, YisR, YisRow, Yj,
213 Yjj, Yld, Ylinc, YmyprocD, YmyprocR, YnbD, YnpD, YnprocsD,
214 YnprocsR, YprocD, YprocR, Yrow, cdst, csrc, ctxt, dst, info,
215 ione=1, mycol, myrow, npcol, nprow, rdst, rsrc, size, src;
216 PBTYP_T * type;
217 VVDOT_T dot;
218/*
219* .. Local Arrays ..
220*/
221 char * buf = NULL;
222 Int Xd[DLEN_], Yd[DLEN_], dbuf[ DLEN_ ];
223/* ..
224* .. Executable Statements ..
225*
226*/
227 PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd );
228 PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd );
229#ifndef NO_ARGCHK
230/*
231* Test the input parameters
232*/
233 Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
234 if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) )
235 {
236 PB_Cchkvec( ctxt, "PCDOTU", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info );
237 PB_Cchkvec( ctxt, "PCDOTU", "Y", *N, 1, Yi, Yj, Yd, *INCY, 11, &info );
238 }
239 if( info ) { PB_Cabort( ctxt, "PCDOTU", info ); return; }
240#endif
241 DOT[REAL_PART] = ZERO;
242 DOT[IMAG_PART] = ZERO;
243/*
244* Quick return if possible
245*/
246 if( *N == 0 ) return;
247/*
248* Handle degenerate case
249*/
250 if( ( *N == 1 ) && ( ( Xd[ M_ ] == 1 ) || ( Yd[ M_ ] == 1 ) ) )
251 {
252 type = PB_Cctypeset();
253 PB_Cpdot11( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX,
254 ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu );
255 return;
256 }
257/*
258* Start the operations
259*/
260#ifdef NO_ARGCHK
261 Cblacs_gridinfo( ( ctxt = Xd[ CTXT_ ] ), &nprow, &npcol, &myrow, &mycol );
262#endif
263/*
264* Determine if sub( X ) is distributed or not
265*/
266 if( ( XisRow = ( *INCX == Xd[M_] ) ) != 0 )
267 XisD = ( ( Xd[CSRC_] >= 0 ) && ( ( XnprocsD = npcol ) > 1 ) );
268 else
269 XisD = ( ( Xd[RSRC_] >= 0 ) && ( ( XnprocsD = nprow ) > 1 ) );
270/*
271* Determine if sub( Y ) is distributed or not
272*/
273 if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 )
274 YisD = ( ( Yd[CSRC_] >= 0 ) && ( ( YnprocsD = npcol ) > 1 ) );
275 else
276 YisD = ( ( Yd[RSRC_] >= 0 ) && ( ( YnprocsD = nprow ) > 1 ) );
277/*
278* Are sub( X ) and sub( Y ) both row or column vectors ?
279*/
280 RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) );
281/*
282* XisD && YisD <=> both vector operands are indeed distributed
283*/
284 if( XisD && YisD )
285 {
286/*
287* Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol
288*/
289 PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj,
290 &Xrow, &Xcol );
291 if( XisRow )
292 {
293 XinbD = Xd[INB_]; XnbD = Xd[NB_];
294 Xld = Xd[LLD_]; Xlinc = Xld;
295 XprocD = Xcol; XmyprocD = mycol;
296 XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow;
297 XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) );
298 Mfirstnb( Xinb1D, *N, Xj, XinbD, XnbD );
299 }
300 else
301 {
302 XinbD = Xd[IMB_]; XnbD = Xd[MB_];
303 Xld = Xd[LLD_]; Xlinc = 1;
304 XprocD = Xrow; XmyprocD = myrow;
305 XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol;
306 XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) );
307 Mfirstnb( Xinb1D, *N, Xi, XinbD, XnbD );
308 }
309/*
310* Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol
311*/
312 PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj,
313 &Yrow, &Ycol );
314 if( YisRow )
315 {
316 YinbD = Yd[INB_]; YnbD = Yd[NB_];
317 Yld = Yd[LLD_]; Ylinc = Yld;
318 YprocD = Ycol; YmyprocD = mycol;
319 YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow;
320 YisR = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) );
321 Mfirstnb( Yinb1D, *N, Yj, YinbD, YnbD );
322 }
323 else
324 {
325 YinbD = Yd[IMB_]; YnbD = Yd[MB_];
326 Yld = Yd[LLD_]; Ylinc = 1;
327 YprocD = Yrow; YmyprocD = myrow;
328 YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol;
329 YisR = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) );
330 Mfirstnb( Yinb1D, *N, Yi, YinbD, YnbD );
331 }
332/*
333* Do sub( X ) and sub( Y ) span more than one process ?
334*/
335 OneDgrid = ( ( XnprocsD == 1 ) && ( YnprocsD == 1 ) );
336 OneBlock = ( ( Xinb1D >= *N ) && ( Yinb1D >= *N ) );
337/*
338* Are sub( X ) and sub( Y ) distributed in the same manner ?
339*/
340 Square = ( ( Xinb1D == Yinb1D ) && ( XnbD == YnbD ) &&
341 ( XnprocsD == YnprocsD ) );
342
343 if( !( XisR ) )
344 {
345/*
346* sub( X ) is not replicated
347*/
348 if( YisR )
349 {
350/*
351* If sub( X ) is not replicated, but sub( Y ) is, a process row or column
352* YprocR need to be selected. It will contain the non-replicated vector used
353* to perform the dot product computation.
354*/
355 if( RRorCC )
356 {
357/*
358* sub( X ) and sub( Y ) are both row or column vectors
359*/
360 if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) )
361 {
362/*
363* sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD.
364* Enforce a purely local operation by choosing YprocR to be equal to XprocR.
365*/
366 YprocR = XprocR;
367 }
368 else
369 {
370/*
371* Otherwise, communication has to occur, so choose the next process row or
372* column for YprocR to maximize the number of links, i.e reduce contention.
373*/
374 YprocR = MModAdd1( XprocR, XnprocsR );
375 }
376 }
377 else
378 {
379/*
380* sub( X ) and sub( Y ) are distributed in orthogonal directions, what is
381* chosen for YprocR does not really matter. Select the process origin.
382*/
383 YprocR = XprocD;
384 }
385 }
386 else
387 {
388/*
389* Neither sub( X ) nor sub( Y ) are replicated. If I am not in process row or
390* column XprocR and not in process row or column YprocR, then quick return.
391*/
392 if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) )
393 return;
394 }
395 }
396 else
397 {
398/*
399* sub( X ) is distributed and replicated (so no quick return possible)
400*/
401 if( YisR )
402 {
403/*
404* sub( Y ) is distributed and replicated as well
405*/
406 if( RRorCC )
407 {
408/*
409* sub( X ) and sub( Y ) are both row or column vectors
410*/
411 if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) )
412 {
413/*
414* sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD.
415* Enforce a purely local operation by choosing XprocR and YprocR to be equal
416* to zero.
417*/
418 XprocR = YprocR = 0;
419 }
420 else
421 {
422/*
423* Otherwise, communication has to occur, so select YprocR to be zero and the
424* next process row or column for XprocR in order to maximize the number of
425* used links, i.e reduce contention.
426*/
427 YprocR = 0;
428 XprocR = MModAdd1( YprocR, YnprocsR );
429 }
430 }
431 else
432 {
433/*
434* sub( X ) and sub( Y ) are distributed in orthogonal directions, select the
435* origin processes.
436*/
437 XprocR = YprocD;
438 YprocR = XprocD;
439 }
440 }
441 else
442 {
443/*
444* sub( Y ) is distributed, but not replicated
445*/
446 if( RRorCC )
447 {
448/*
449* sub( X ) and sub( Y ) are both row or column vectors
450*/
451 if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) )
452 {
453/*
454* sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD.
455* Enforce a purely local operation by choosing XprocR to be equal to YprocR.
456*/
457 XprocR = YprocR;
458 }
459 else
460 {
461/*
462* Otherwise, communication has to occur, so choose the next process row or
463* column for XprocR to maximize the number of links, i.e reduce contention.
464*/
465 XprocR = MModAdd1( YprocR, YnprocsR );
466 }
467 }
468 else
469 {
470/*
471* sub( X ) and sub( Y ) are distributed in orthogonal directions, what is
472* chosen for XprocR does not really matter. Select the origin process.
473*/
474 XprocR = YprocD;
475 }
476 }
477 }
478/*
479* Even if sub( X ) and/or sub( Y ) are replicated, only two process row or
480* column are active, namely XprocR and YprocR. If any of those operands is
481* replicated, broadcast will occur (unless there is an easy way out).
482*/
483 type = PB_Cctypeset(); size = type->size; dot = type->Fvvdotu;
484/*
485* A purely operation occurs iff the operands start in the same process and if
486* either the grid is mono-dimensional or there is a single local block to be
487* operated with or if both operands are aligned.
488*/
489 if( ( ( RRorCC && ( XprocD == YprocD ) && ( XprocR == YprocR ) ) ||
490 ( !( RRorCC ) && ( XprocD == YprocR ) && ( XprocR == YprocD ) ) ) &&
491 ( OneDgrid || OneBlock || ( RRorCC && Square ) ) )
492 {
493 if( ( !XisR && ( XmyprocR == XprocR ) &&
494 !YisR && ( YmyprocR == YprocR ) ) ||
495 ( !XisR && YisR && ( YmyprocR == YprocR ) ) ||
496 ( !YisR && XisR && ( XmyprocR == XprocR ) ) ||
497 ( XisR && YisR ) )
498 {
499 XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD,
500 XnprocsD );
501 YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD,
502 YnprocsD );
503 if( ( XnpD > 0 ) && ( YnpD > 0 ) )
504 {
505 dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld,
506 size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ),
507 &Ylinc );
508 }
509 }
510/*
511* Combine the local results in sub( X )'s scope
512*/
513 if( ( XisR && YisR ) || ( XmyprocR == XprocR ) )
514 {
515 scope = ( XisRow ? CROW : CCOLUMN );
516 top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET );
517 Ccgsum2d( ctxt, &scope, top, 1, 1, ((char *) DOT), 1, -1, 0 );
518 }
519 if( RRorCC && XisR && YisR ) return;
520 }
521 else if( ( RRorCC && OneDgrid ) || OneBlock || Square )
522 {
523/*
524* Otherwise, it may be possible to compute the desired dot-product in a single
525* message exchange iff the grid is mono-dimensional and the operands are
526* distributed in the same direction, or there is just one block to be exchanged
527* or if both operands are similarly distributed in their respective direction.
528*/
529 if( ( YmyprocR == YprocR ) )
530 {
531/*
532* The processes owning a piece of sub( Y ) send it to the corresponding
533* process owning s piece of sub ( X ).
534*/
535 YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD,
536 YnprocsD );
537 if( YnpD > 0 )
538 {
539 dst = XprocD + MModSub( YmyprocD, YprocD, YnprocsD );
540 dst = MPosMod( dst, XnprocsD );
541 if( XisRow ) { rdst = XprocR; cdst = dst; }
542 else { rdst = dst; cdst = XprocR; }
543
544 if( ( myrow == rdst ) && ( mycol == cdst ) )
545 {
546 dot( &YnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld,
547 size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld,
548 size ), &Ylinc );
549 }
550 else
551 {
552 if( YisRow )
553 Ccgesd2d( ctxt, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj,
554 Yld, size ), Yld, rdst, cdst );
555 else
556 Ccgesd2d( ctxt, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj,
557 Yld, size ), Yld, rdst, cdst );
558 }
559 }
560 }
561 if( XmyprocR == XprocR )
562 {
563/*
564* The processes owning a piece of sub( X ) receive the corresponding local
565* piece of sub( Y ), compute the local dot product and combine the results
566* within sub( X )'s scope.
567*/
568 XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD,
569 XnprocsD );
570 if( XnpD > 0 )
571 {
572 src = YprocD + MModSub( XmyprocD, XprocD, XnprocsD );
573 src = MPosMod( src, YnprocsD );
574 if( YisRow ) { rsrc = YprocR; csrc = src; }
575 else { rsrc = src; csrc = YprocR; }
576 if( ( myrow != rsrc ) || ( mycol != csrc ) )
577 {
578 buf = PB_Cmalloc( XnpD * size );
579 if( YisRow )
580 Ccgerv2d( ctxt, 1, XnpD, buf, 1, rsrc, csrc );
581 else
582 Ccgerv2d( ctxt, XnpD, 1, buf, XnpD, rsrc, csrc );
583 dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld,
584 size ), &Xlinc, buf, &ione );
585 if( buf ) free( buf );
586 }
587 }
588 if( XisRow )
589 {
590 top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
591 Ccgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 );
592 }
593 else
594 {
595 top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
596 Ccgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 );
597 }
598 }
599 }
600 else
601 {
602/*
603* General case, copy sub( Y ) within sub( X )'s scope, compute the local
604* results and combine them within sub( X )'s scope.
605*/
606 XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD );
607
608 if( XisRow )
609 {
610 PB_Cdescset( dbuf, 1, *N, 1, Xinb1D, 1, XnbD, XprocR, XprocD, ctxt,
611 1 );
612 }
613 else
614 {
615 PB_Cdescset( dbuf, *N, 1, Xinb1D, 1, XnbD, 1, XprocD, XprocR, ctxt,
616 MAX( 1, XnpD ) );
617 }
618 if( ( XmyprocR == XprocR ) && ( XnpD > 0 ) )
619 buf = PB_Cmalloc( XnpD * size );
620
621 if( YisRow )
622 {
623 PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, ((char *) Y), Yi, Yj,
624 Yd, ROW, type->zero, buf, 0, 0, dbuf, ( XisRow ? ROW :
625 COLUMN ) );
626 }
627 else
628 {
629 PB_Cpaxpby( type, NOCONJG, *N, 1, type->one, ((char *) Y), Yi, Yj,
630 Yd, COLUMN, type->zero, buf, 0, 0, dbuf, ( XisRow ?
631 ROW : COLUMN ) );
632 }
633
634 if( XmyprocR == XprocR )
635 {
636 if( XnpD > 0 )
637 {
638 dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld,
639 size ), &Xlinc, buf, &ione );
640 if( buf ) free( buf );
641 }
642 if( XisRow )
643 {
644 top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
645 Ccgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 );
646 }
647 else
648 {
649 top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
650 Ccgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 );
651 }
652 }
653 }
654/*
655* Send the DOT product result within sub( Y )'s scope
656*/
657 if( XisR || YisR )
658 {
659/*
660* Either sub( X ) or sub( Y ) are replicated, so that every process should have
661* the result -> broadcast it orthogonally from sub( X )'s direction.
662*/
663 if( XisRow )
664 {
665 top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
666 if( XmyprocR == XprocR )
667 Ccgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 );
668 else
669 Ccgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, XprocR,
670 XmyprocD );
671 }
672 else
673 {
674 top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
675 if( XmyprocR == XprocR )
676 Ccgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 );
677 else
678 Ccgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, XmyprocD,
679 XprocR );
680 }
681 }
682 else
683 {
684/*
685* Neither sub( X ) nor sub( Y ) are replicated
686*/
687 if( RRorCC )
688 {
689/*
690* Both sub( X ) are distributed in the same direction -> the process row or
691* column XprocR sends the result to the process row or column YprocR.
692*/
693 if( XprocR != YprocR )
694 {
695 if( XmyprocR == XprocR )
696 {
697 if( XisRow )
698 Ccgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YprocR,
699 YmyprocD );
700 else
701 Ccgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YmyprocD,
702 YprocR );
703 }
704 else if( YmyprocR == YprocR )
705 {
706 if( XisRow )
707 Ccgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XprocR,
708 XmyprocD );
709 else
710 Ccgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XmyprocD,
711 XprocR );
712 }
713 }
714 }
715 else
716 {
717/*
718* Otherwise, the process at the intersection of sub( X )'s and sub( Y )'s
719* scope, broadcast the result within sub( Y )'s scope.
720*/
721 if( YmyprocR == YprocR )
722 {
723 if( YisRow )
724 {
725 top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
726 if( YmyprocD == XprocR )
727 Ccgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 );
728 else
729 Ccgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1,
730 YprocR, XprocR );
731 }
732 else
733 {
734 top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
735 if( YmyprocD == XprocR )
736 Ccgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 );
737 else
738 Ccgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1,
739 XprocR, YprocR );
740 }
741 }
742 }
743 }
744 }
745 else if( !( XisD ) && YisD )
746 {
747/*
748* sub( X ) is not distributed and sub( Y ) is distributed.
749*/
750 type = PB_Cctypeset();
751 PB_CpdotND( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX,
752 ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu );
753 }
754 else if( XisD && !( YisD ) )
755 {
756/*
757* sub( X ) is distributed and sub( Y ) is not distributed.
758*/
759 type = PB_Cctypeset();
760 PB_CpdotND( type, *N, ((char *) DOT), ((char *) Y), Yi, Yj, Yd, *INCY,
761 ((char *) X), Xi, Xj, Xd, *INCX, type->Fvvdotu );
762 }
763 else
764 {
765/*
766* Neither sub( X ) nor sub( Y ) are distributed
767*/
768 type = PB_Cctypeset();
769 PB_CpdotNN( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX,
770 ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotu );
771 }
772/*
773* End of PCDOTU
774*/
775}
#define Int
Definition Bconfig.h:22
#define REAL_PART
Definition pblas.h:139
F_VOID_FCT(* VVDOT_T)()
Definition pblas.h:290
#define IMAG_PART
Definition pblas.h:140
#define CCOLUMN
Definition PBblacs.h:20
void Ccgesd2d()
#define TOP_GET
Definition PBblacs.h:50
#define COLUMN
Definition PBblacs.h:45
#define COMBINE
Definition PBblacs.h:49
#define CROW
Definition PBblacs.h:21
void Ccgebr2d()
void Ccgerv2d()
#define ROW
Definition PBblacs.h:46
void Cblacs_gridinfo()
#define BCAST
Definition PBblacs.h:48
void Ccgebs2d()
void Ccgsum2d()
#define NOCONJG
Definition PBblas.h:45
#define pcdotu_
Definition PBpblas.h:84
#define CTXT_
Definition PBtools.h:38
#define MAX(a_, b_)
Definition PBtools.h:77
#define MB_
Definition PBtools.h:43
void PB_Cabort()
void PB_Cchkvec()
char * PB_Cmalloc()
void PB_Cinfog2l()
#define MModSub(I1, I2, d)
Definition PBtools.h:102
void PB_Cpdot11()
#define Mptr(a_, i_, j_, lda_, siz_)
Definition PBtools.h:132
#define LLD_
Definition PBtools.h:47
Int PB_Cnumroc()
char * PB_Ctop()
void PB_CpdotNN()
#define RSRC_
Definition PBtools.h:45
void PB_Cdescset()
#define MModAdd1(I, d)
Definition PBtools.h:100
#define Mfirstnb(inbt_, n_, i_, inb_, nb_)
Definition PBtools.h:139
#define M_
Definition PBtools.h:39
#define INB_
Definition PBtools.h:42
#define MPosMod(I, d)
Definition PBtools.h:95
void PB_CpdotND()
void PB_CargFtoC()
#define CSRC_
Definition PBtools.h:46
PBTYP_T * PB_Cctypeset()
#define IMB_
Definition PBtools.h:41
#define ZERO
Definition PBtools.h:66
#define DLEN_
Definition PBtools.h:48
#define NB_
Definition PBtools.h:44
void PB_Cpaxpby()
VVDOT_T Fvvdotu
Definition pblas.h:357
Int size
Definition pblas.h:333
char * zero
Definition pblas.h:335
char * one
Definition pblas.h:336