SCALAPACK 2.2.2
LAPACK: Linear Algebra PACKage
Loading...
Searching...
No Matches
PB_CpswapND.c
Go to the documentation of this file.
1/* ---------------------------------------------------------------------
2*
3* -- PBLAS auxiliary routine (version 2.0) --
4* University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5* and University of California, Berkeley.
6* April 1, 1998
7*
8* ---------------------------------------------------------------------
9*/
10/*
11* Include files
12*/
13#include "../pblas.h"
14#include "../PBpblas.h"
15#include "../PBtools.h"
16#include "../PBblacs.h"
17#include "../PBblas.h"
18
19#ifdef __STDC__
20void PB_CpswapND( PBTYP_T * TYPE, Int N,
21 char * X, Int IX, Int JX, Int * DESCX, Int INCX,
22 char * Y, Int IY, Int JY, Int * DESCY, Int INCY )
23#else
24void PB_CpswapND( TYPE, N, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY )
25/*
26* .. Scalar Arguments ..
27*/
28 Int INCX, INCY, IX, IY, JX, JY, N;
29 PBTYP_T * TYPE;
30/*
31* .. Array Arguments ..
32*/
33 Int * DESCX, * DESCY;
34 char * X, * Y;
35#endif
36{
37/*
38* Purpose
39* =======
40*
41* PB_CpswapND swaps two subvectors,
42*
43* sub( Y ) := sub( X ) and sub( X ) := sub( Y )
44*
45* where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X,
46* X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X,
47*
48* sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y,
49* Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y.
50*
51* sub( X ) is assumed to be not distributed, and sub( Y ) is assumed to
52* be distributed.
53*
54* Notes
55* =====
56*
57* A description vector is associated with each 2D block-cyclicly dis-
58* tributed matrix. This vector stores the information required to
59* establish the mapping between a matrix entry and its corresponding
60* process and memory location.
61*
62* In the following comments, the character _ should be read as
63* "of the distributed matrix". Let A be a generic term for any 2D
64* block cyclicly distributed matrix. Its description vector is DESC_A:
65*
66* NOTATION STORED IN EXPLANATION
67* ---------------- --------------- ------------------------------------
68* DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
69* CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
70* the NPROW x NPCOL BLACS process grid
71* A is distributed over. The context
72* itself is global, but the handle
73* (the integer value) may vary.
74* M_A (global) DESCA[ M_ ] The number of rows in the distribu-
75* ted matrix A, M_A >= 0.
76* N_A (global) DESCA[ N_ ] The number of columns in the distri-
77* buted matrix A, N_A >= 0.
78* IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
79* block of the matrix A, IMB_A > 0.
80* INB_A (global) DESCA[ INB_ ] The number of columns of the upper
81* left block of the matrix A,
82* INB_A > 0.
83* MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
84* bute the last M_A-IMB_A rows of A,
85* MB_A > 0.
86* NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
87* bute the last N_A-INB_A columns of
88* A, NB_A > 0.
89* RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
90* row of the matrix A is distributed,
91* NPROW > RSRC_A >= 0.
92* CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
93* first column of A is distributed.
94* NPCOL > CSRC_A >= 0.
95* LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
96* array storing the local blocks of
97* the distributed matrix A,
98* IF( Lc( 1, N_A ) > 0 )
99* LLD_A >= MAX( 1, Lr( 1, M_A ) )
100* ELSE
101* LLD_A >= 1.
102*
103* Let K be the number of rows of a matrix A starting at the global in-
104* dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
105* that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
106* receive if these K rows were distributed over NPROW processes. If K
107* is the number of columns of a matrix A starting at the global index
108* JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
109* lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
110* these K columns were distributed over NPCOL processes.
111*
112* The values of Lr() and Lc() may be determined via a call to the func-
113* tion PB_Cnumroc:
114* Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
115* Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
116*
117* Arguments
118* =========
119*
120* TYPE (local input) pointer to a PBTYP_T structure
121* On entry, TYPE is a pointer to a structure of type PBTYP_T,
122* that contains type information (See pblas.h).
123*
124* N (global input) INTEGER
125* On entry, N specifies the length of the subvectors to be
126* swapped. N must be at least zero.
127*
128* X (local input/local output) pointer to CHAR
129* On entry, X is an array of dimension (LLD_X, Kx), where LLD_X
130* is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and
131* MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least
132* Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise.
133* Before entry, this array contains the local entries of the
134* matrix X. On exit, sub( X ) is overwritten with sub( Y ).
135*
136* IX (global input) INTEGER
137* On entry, IX specifies X's global row index, which points to
138* the beginning of the submatrix sub( X ).
139*
140* JX (global input) INTEGER
141* On entry, JX specifies X's global column index, which points
142* to the beginning of the submatrix sub( X ).
143*
144* DESCX (global and local input) INTEGER array
145* On entry, DESCX is an integer array of dimension DLEN_. This
146* is the array descriptor for the matrix X.
147*
148* INCX (global input) INTEGER
149* On entry, INCX specifies the global increment for the
150* elements of X. Only two values of INCX are supported in
151* this version, namely 1 and M_X. INCX must not be zero.
152*
153* Y (local input/local output) pointer to CHAR
154* On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y
155* is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and
156* MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least
157* Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise.
158* Before entry, this array contains the local entries of the
159* matrix Y. On exit, sub( Y ) is overwritten with sub( X ).
160*
161* IY (global input) INTEGER
162* On entry, IY specifies Y's global row index, which points to
163* the beginning of the submatrix sub( Y ).
164*
165* JY (global input) INTEGER
166* On entry, JY specifies Y's global column index, which points
167* to the beginning of the submatrix sub( Y ).
168*
169* DESCY (global and local input) INTEGER array
170* On entry, DESCY is an integer array of dimension DLEN_. This
171* is the array descriptor for the matrix Y.
172*
173* INCY (global input) INTEGER
174* On entry, INCY specifies the global increment for the
175* elements of Y. Only two values of INCY are supported in
176* this version, namely 1 and M_Y. INCY must not be zero.
177*
178* -- Written on April 1, 1998 by
179* Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
180*
181* ---------------------------------------------------------------------
182*/
183/*
184* .. Local Scalars ..
185*/
186 char scope, * top, * zero;
187 Int RRorCC, Xcol, Xii, XisR, XisRow, Xjj, Xld, Xlinc, Xm,
188 XmyprocD, XmyprocR, Xn, XnprocsD, XnprocsR, XprocR, Xroc,
189 Xrow, Ycol, Yii, Yinb1D, YisR, YisRow, Yjj, Yld, Ylinc,
190 YmyprocD, YmyprocR, YnbD, YnpD, YnprocsD, YnprocsR, YprocD,
191 YprocR, Yroc, Yrow, ctxt, ione=1, k, kbb, kk, kn, ktmp, mycol,
192 mydist, myproc, myrow, npcol, nprow, p, size;
193/*
194* .. Local Arrays ..
195*/
196 char * buf = NULL;
197/* ..
198* .. Executable Statements ..
199*
200*/
201/*
202* Retrieve process grid information
203*/
204 Cblacs_gridinfo( ( ctxt = DESCX[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
205/*
206* Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol ...
207*/
208 PB_Cinfog2l( IX, JX, DESCX, nprow, npcol, myrow, mycol, &Xii, &Xjj,
209 &Xrow, &Xcol );
210 if( ( XisRow = ( INCX == DESCX[M_] ) ) != 0 )
211 {
212 Xld = DESCX[LLD_]; Xlinc = Xld;
213 XmyprocD = mycol; XnprocsD = npcol;
214 XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow;
215 XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) );
216 }
217 else
218 {
219 Xld = DESCX[LLD_]; Xlinc = 1;
220 XmyprocD = myrow; XnprocsD = nprow;
221 XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol;
222 XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) );
223 }
224/*
225* Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol ...
226*/
227 PB_Cinfog2l( IY, JY, DESCY, nprow, npcol, myrow, mycol, &Yii, &Yjj,
228 &Yrow, &Ycol );
229 if( ( YisRow = ( INCY == DESCY[M_] ) ) != 0 )
230 {
231 YnbD = DESCY[NB_]; Yld = DESCY[LLD_]; Ylinc = Yld;
232 YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow;
233 YprocD = Ycol; YmyprocD = mycol; YnprocsD = npcol;
234 Yinb1D = PB_Cfirstnb( N, JY, DESCY[INB_], YnbD );
235 }
236 else
237 {
238 YnbD = DESCY[MB_]; Yld = DESCY[LLD_]; Ylinc = 1;
239 YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol;
240 YprocD = Yrow; YmyprocD = myrow; YnprocsD = nprow;
241 Yinb1D = PB_Cfirstnb( N, IY, DESCY[IMB_], YnbD );
242 }
243
244 YisR = ( ( YprocR == -1 ) || ( YnprocsR == 1 ) );
245/*
246* Are sub( X ) and sub( Y ) both row or column vectors ?
247*/
248 RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) );
249/*
250* sub( X ) is not distributed and sub( Y ) is distributed
251*/
252 size = TYPE->size; zero = TYPE->zero;
253
254 if( !( XisR ) )
255 {
256/*
257* sub( X ) is not replicated. Since this operation is local if sub( X ) and
258* sub( Y ) are both row or column vectors, choose YprocR = XprocR when RRorCC,
259* and YprocR = 0 otherwise.
260*/
261 if( YisR ) { YprocR = ( ( RRorCC ) ? XprocR : 0 ); }
262/*
263* Now, it is just like sub( Y ) is not replicated, this information however is
264* kept in YisR for later use.
265*/
266 if( ( XmyprocR == XprocR ) || ( YmyprocR == YprocR ) )
267 {
268/*
269* sub( X ) and sub( Y ) are both row or column vectors
270*/
271 if( RRorCC )
272 {
273 YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD );
274/*
275* sub( X ) and sub( Y ) are in the same process row or column
276*/
277 if( XprocR == YprocR )
278 {
279/*
280* In a given process, sub( Y ) is swapped with the corresponding non distribu-
281* ted part of sub( X ). In the other processes, this part of sub( X ) is set
282* to zero for later reduction.
283*/
284 if( YnpD > 0 )
285 {
286 Yroc = YprocD;
287 if( XisRow ) { kk = Yjj; ktmp = JX + N; kn = JX + Yinb1D; }
288 else { kk = Yii; ktmp = IX + N; kn = IX + Yinb1D; }
289
290 if( YmyprocD == Yroc )
291 {
292 TYPE->Fswap( &Yinb1D, Mptr( X, Xii, Xjj, Xld, size ),
293 &Xlinc, Mptr( Y, Yii, Yjj, Yld, size ),
294 &Ylinc );
295 kk += Yinb1D;
296 }
297 else
298 {
299 TYPE->Fset( &Yinb1D, zero, Mptr( X, Xii, Xjj, Xld, size ),
300 &Xlinc );
301 }
302 Yroc = MModAdd1( Yroc, YnprocsD );
303
304 for( k = kn; k < ktmp; k += YnbD )
305 {
306 kbb = ktmp - k; kbb = MIN( kbb, YnbD );
307 if( YmyprocD == Yroc )
308 {
309 if( XisRow )
310 TYPE->Fswap( &kbb, Mptr( X, Xii, k, Xld, size ),
311 &Xlinc, Mptr( Y, Yii, kk, Yld, size ),
312 &Ylinc );
313 else
314 TYPE->Fswap( &kbb, Mptr( X, k, Xjj, Xld, size ),
315 &Xlinc, Mptr( Y, kk, Yjj, Yld, size ),
316 &Ylinc );
317 kk += kbb;
318 }
319 else
320 {
321 if( XisRow )
322 TYPE->Fset( &kbb, zero, Mptr( X, Xii, k, Xld, size ),
323 &Xlinc );
324 else
325 TYPE->Fset( &kbb, zero, Mptr( X, k, Xjj, Xld, size ),
326 &Xlinc );
327 }
328 Yroc = MModAdd1( Yroc, YnprocsD );
329 }
330 }
331 else
332 {
333/*
334* If I don't own any entries of sub( Y ), then zero the entire sub( X )
335* residing in this process.
336*/
337 TYPE->Fset( &N, zero, Mptr( X, Xii, Xjj, Xld, size ),
338 &Xlinc );
339 }
340/*
341* Replicate locally scattered sub( X ) by reducing it
342*/
343 if( XisRow )
344 {
345 top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
346 TYPE->Cgsum2d( ctxt, ROW, top, 1, N, Mptr( X, Xii, Xjj, Xld,
347 size ), Xld, -1, 0 );
348 }
349 else
350 {
351 top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
352 TYPE->Cgsum2d( ctxt, COLUMN, top, N, 1, Mptr( X, Xii, Xjj,
353 Xld, size ), Xld, -1, 0 );
354 }
355 }
356 else
357 {
358/*
359* sub( X ) and sub( Y ) are in a different process row or column
360*/
361 if( YmyprocR == YprocR )
362 {
363/*
364* If I own a piece of sub( Y ), then send it to the process row or column where
365* sub( X ) resides and receive back the sub( X ) data from the same process.
366*/
367 if( YnpD > 0 )
368 {
369 if( YisRow )
370 {
371 TYPE->Cgesd2d( ctxt, 1, YnpD, Mptr( Y, Yii, Yjj, Yld,
372 size ), Yld, XprocR, YmyprocD );
373 TYPE->Cgerv2d( ctxt, 1, YnpD, Mptr( Y, Yii, Yjj, Yld,
374 size ), Yld, XprocR, YmyprocD );
375 }
376 else
377 {
378 TYPE->Cgesd2d( ctxt, YnpD, 1, Mptr( Y, Yii, Yjj, Yld,
379 size ), Yld, YmyprocD, XprocR );
380 TYPE->Cgerv2d( ctxt, YnpD, 1, Mptr( Y, Yii, Yjj, Yld,
381 size ), Yld, YmyprocD, XprocR );
382 }
383 }
384 }
385
386 if( XmyprocR == XprocR )
387 {
388/*
389* If I own a sub( X ), then receive the distributed part of sub( Y ) owned by
390* the process where sub( Y ) resides in my row or column. Perform a local swap
391* as if sub( Y ) would reside in the same process row or column as sub( X ).
392* Send the result back and finally perform the reduction to replicate sub( X ).
393*/
394 if( YnpD > 0 )
395 {
396 buf = PB_Cmalloc( YnpD * size );
397 if( YisRow )
398 TYPE->Cgerv2d( ctxt, 1, YnpD, buf, 1, YprocR,
399 XmyprocD );
400 else
401 TYPE->Cgerv2d( ctxt, YnpD, 1, buf, YnpD, XmyprocD,
402 YprocR );
403
404 Yroc = YprocD;
405 kk = 0;
406 if( XisRow ) { ktmp = JX + N; kn = JX + Yinb1D; }
407 else { ktmp = IX + N; kn = IX + Yinb1D; }
408
409 if( YmyprocD == Yroc )
410 {
411 TYPE->Fswap( &Yinb1D, Mptr( X, Xii, Xjj, Xld, size ),
412 &Xlinc, buf, &ione );
413 kk += Yinb1D;
414 }
415 else
416 {
417 TYPE->Fset( &Yinb1D, zero, Mptr( X, Xii, Xjj, Xld,
418 size ), &Xlinc );
419 }
420 Yroc = MModAdd1( Yroc, YnprocsD );
421
422 for( k = kn; k < ktmp; k += YnbD )
423 {
424 kbb = ktmp - k; kbb = MIN( kbb, YnbD );
425
426 if( YmyprocD == Yroc )
427 {
428 if( XisRow )
429 TYPE->Fswap( &kbb, Mptr( X, Xii, k, Xld, size ),
430 &Xlinc, buf+kk*size, &ione );
431 else
432 TYPE->Fswap( &kbb, Mptr( X, k, Xjj, Xld, size ),
433 &Xlinc, buf+kk*size, &ione );
434 kk += kbb;
435 }
436 else
437 {
438 if( XisRow )
439 TYPE->Fset( &kbb, zero, Mptr( X, Xii, k, Xld,
440 size ), &Xlinc );
441 else
442 TYPE->Fset( &kbb, zero, Mptr( X, k, Xjj, Xld,
443 size ), &Xlinc );
444 }
445 Yroc = MModAdd1( Yroc, YnprocsD );
446 }
447 if( YisRow )
448 TYPE->Cgesd2d( ctxt, 1, YnpD, buf, 1, YprocR,
449 XmyprocD );
450 else
451 TYPE->Cgesd2d( ctxt, YnpD, 1, buf, YnpD, XmyprocD,
452 YprocR );
453 if( buf ) free( buf );
454 }
455 else
456 {
457 TYPE->Fset( &N, zero, Mptr( X, Xii, Xjj, Xld, size ),
458 &Xlinc );
459 }
460/*
461* Replicate locally scattered sub( X ) by reducing it
462*/
463 if( XisRow )
464 {
465 top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
466 TYPE->Cgsum2d( ctxt, ROW, top, 1, N, Mptr( X, Xii, Xjj,
467 Xld, size ), Xld, -1, 0 );
468 }
469 else
470 {
471 top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
472 TYPE->Cgsum2d( ctxt, COLUMN, top, N, 1, Mptr( X, Xii, Xjj,
473 Xld, size ), Xld, -1, 0 );
474 }
475 }
476 }
477 }
478 else
479 {
480/*
481* sub( X ) and sub( Y ) are not both row or column vectors
482*/
483 Xroc = 0;
484 if( XisRow ) { ktmp = JX + N; kn = JX + Yinb1D; }
485 else { ktmp = IX + N; kn = IX + Yinb1D; }
486/*
487* Loop over the processes in which sub( Y ) resides, for each process find the
488* next process Xroc and swap the data. After this, it will be needed to reduce
489* sub( X ) as above.
490*/
491 for( p = 0; p < YnprocsD; p++ )
492 {
493 mydist = MModSub( p, YprocD, YnprocsD );
494 myproc = MModAdd( YprocD, mydist, YnprocsD );
495
496 if( ( XprocR == p ) && ( YprocR == Xroc ) )
497 {
498/*
499* Swap locally at the intersection of the process cross
500*/
501 if( XmyprocR == p )
502 {
503 YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, p, YprocD,
504 YnprocsD );
505 if( YnpD > 0 )
506 {
507 Yroc = YprocD;
508 kk = ( XisRow ? Yii : Yjj );
509
510 if( myproc == Yroc )
511 {
512 if( XmyprocD == Xroc )
513 {
514 TYPE->Fswap( &Yinb1D, Mptr( X, Xii, Xjj, Xld,
515 size ), &Xlinc, Mptr( Y, Yii, Yjj,
516 Yld, size ), &Ylinc );
517 kk += Yinb1D;
518 }
519 else
520 {
521 TYPE->Fset( &Yinb1D, zero, Mptr( X, Xii, Xjj, Xld,
522 size ), &Xlinc );
523 }
524 }
525 Yroc = MModAdd1( Yroc, YnprocsD );
526
527 for( k = kn; k < ktmp; k += YnbD )
528 {
529 kbb = ktmp - k; kbb = MIN( kbb, YnbD );
530 if( myproc == Yroc )
531 {
532 if( XmyprocD == Xroc )
533 {
534 if( XisRow )
535 TYPE->Fswap( &kbb, Mptr( X, Xii, k, Xld,
536 size ), &Xlinc, Mptr( Y, kk,
537 Yjj, Yld, size ), &Ylinc );
538 else
539 TYPE->Fswap( &kbb, Mptr( X, k, Xjj, Xld,
540 size ), &Xlinc, Mptr( Y, Yii,
541 kk, Yld, size ), &Ylinc );
542 kk += kbb;
543 }
544 else
545 {
546 if( XisRow )
547 TYPE->Fset( &kbb, zero, Mptr( X, Xii, k,
548 Xld, size ), &Xlinc );
549 else
550 TYPE->Fset( &kbb, zero, Mptr( X, k, Xjj,
551 Xld, size ), &Xlinc );
552 }
553 }
554 Yroc = MModAdd1( Yroc, YnprocsD );
555 }
556 }
557 }
558 }
559 else
560 {
561/*
562* Message exchange
563*/
564 if( ( YmyprocR == YprocR ) && ( YmyprocD == p ) )
565 {
566 YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, p, YprocD,
567 YnprocsD );
568 if( YnpD > 0 )
569 {
570 if( XisRow )
571 {
572 TYPE->Cgesd2d( ctxt, YnpD, 1, Mptr( Y, Yii, Yjj,
573 Yld, size ), Yld, XprocR, Xroc );
574 TYPE->Cgerv2d( ctxt, YnpD, 1, Mptr( Y, Yii, Yjj,
575 Yld, size ), Yld, XprocR, Xroc );
576 }
577 else
578 {
579 TYPE->Cgesd2d( ctxt, 1, YnpD, Mptr( Y, Yii, Yjj,
580 Yld, size ), Yld, Xroc, XprocR );
581 TYPE->Cgerv2d( ctxt, 1, YnpD, Mptr( Y, Yii, Yjj,
582 Yld, size ), Yld, Xroc, XprocR );
583 }
584 }
585 }
586
587 if( XmyprocR == XprocR )
588 {
589 YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, p, YprocD,
590 YnprocsD );
591 if( YnpD > 0 )
592 {
593 Yroc = YprocD;
594 kk = 0;
595/*
596* Receive the piece of sub( Y ) that I should handle
597*/
598 if( XmyprocD == Xroc )
599 {
600 buf = PB_Cmalloc( YnpD * size );
601 if( XisRow )
602 TYPE->Cgerv2d( ctxt, YnpD, 1, buf, YnpD,
603 p, YprocR );
604 else
605 TYPE->Cgerv2d( ctxt, 1, YnpD, buf, 1,
606 YprocR, p );
607 }
608
609 if( myproc == Yroc )
610 {
611 if( XmyprocD == Xroc )
612 {
613 TYPE->Fswap( &Yinb1D, Mptr( X, Xii, Xjj, Xld,
614 size ), &Xlinc, buf, &ione );
615 kk += Yinb1D;
616 }
617 else
618 {
619 TYPE->Fset( &Yinb1D, zero, Mptr( X, Xii, Xjj,
620 Xld, size ), &Xlinc );
621 }
622 }
623 Yroc = MModAdd1( Yroc, YnprocsD );
624
625 for( k = kn; k < ktmp; k += YnbD )
626 {
627 kbb = ktmp - k; kbb = MIN( kbb, YnbD );
628 if( myproc == Yroc )
629 {
630 if( XmyprocD == Xroc )
631 {
632 if( XisRow )
633 TYPE->Fswap( &kbb, Mptr( X, Xii, k, Xld,
634 size ), &Xlinc, buf+kk*size,
635 &ione );
636 else
637 TYPE->Fswap( &kbb, Mptr( X, k, Xjj, Xld,
638 size ), &Xlinc, buf+kk*size,
639 &ione );
640 kk += kbb;
641 }
642 else
643 {
644 if( XisRow )
645 TYPE->Fset( &kbb, zero, Mptr( X, Xii, k,
646 Xld, size ), &Xlinc );
647 else
648 TYPE->Fset( &kbb, zero, Mptr( X, k, Xjj,
649 Xld, size ), &Xlinc );
650 }
651 }
652 Yroc = MModAdd1( Yroc, YnprocsD );
653 }
654
655 if( XmyprocD == Xroc )
656 {
657 if( XisRow )
658 TYPE->Cgesd2d( ctxt, YnpD, 1, buf, YnpD,
659 p, YprocR );
660 else
661 TYPE->Cgesd2d( ctxt, 1, YnpD, buf, 1,
662 YprocR, p );
663 if( buf ) free( buf );
664 }
665 }
666 }
667 }
668 Xroc = MModAdd1( Xroc, XnprocsD );
669 }
670/*
671* Replicate locally scattered sub( X ) by reducing it
672*/
673 if( XmyprocR == XprocR )
674 {
675 if( XisRow )
676 {
677 top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
678 TYPE->Cgsum2d( ctxt, ROW, top, 1, N, Mptr( X, Xii, Xjj,
679 Xld, size ), Xld, -1, 0 );
680 }
681 else
682 {
683 top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
684 TYPE->Cgsum2d( ctxt, COLUMN, top, N, 1, Mptr( X, Xii, Xjj,
685 Xld, size ), Xld, -1, 0 );
686 }
687 }
688 }
689 }
690
691 if( YisR )
692 {
693/*
694* Replicate sub( Y )
695*/
696 YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD );
697 if( YnpD > 0 )
698 {
699 if( YisRow )
700 {
701 top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
702 if( YmyprocR == YprocR )
703 TYPE->Cgebs2d( ctxt, COLUMN, top, 1, YnpD, Mptr( Y, Yii, Yjj,
704 Yld, size ), Yld );
705 else
706 TYPE->Cgebr2d( ctxt, COLUMN, top, 1, YnpD, Mptr( Y, Yii, Yjj,
707 Yld, size ), Yld, YprocR, YmyprocD );
708 }
709 else
710 {
711 top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
712 if( YmyprocR == YprocR )
713 TYPE->Cgebs2d( ctxt, ROW, top, YnpD, 1, Mptr( Y, Yii, Yjj,
714 Yld, size ), Yld );
715 else
716 TYPE->Cgebr2d( ctxt, ROW, top, YnpD, 1, Mptr( Y, Yii, Yjj,
717 Yld, size ), Yld, YmyprocD, YprocR );
718 }
719 }
720 }
721 }
722 else
723 {
724/*
725* sub( X ) is replicated in every process. Swap the data in process row or
726* column YprocR when sub( Y ) is not replicated and in every process otherwise.
727*/
728 if( YisR || ( YmyprocR == YprocR ) )
729 {
730 YnpD = PB_Cnumroc( N, 0, Yinb1D, YnbD, YmyprocD, YprocD, YnprocsD );
731
732 if( YnpD > 0 )
733 {
734 Yroc = YprocD;
735 kk = ( YisRow ? Yjj : Yii );
736
737 if( XisRow ) { ktmp = JX + N; kn = JX + Yinb1D; }
738 else { ktmp = IX + N; kn = IX + Yinb1D; }
739
740 if( YmyprocD == Yroc )
741 {
742 TYPE->Fswap( &Yinb1D, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc,
743 Mptr( Y, Yii, Yjj, Yld, size ), &Ylinc );
744 kk += Yinb1D;
745 }
746 else
747 {
748 TYPE->Fset( &Yinb1D, zero, Mptr( X, Xii, Xjj, Xld, size ),
749 &Xlinc );
750 }
751 Yroc = MModAdd1( Yroc, YnprocsD );
752
753 for( k = kn; k < ktmp; k += YnbD )
754 {
755 kbb = ktmp - k; kbb = MIN( kbb, YnbD );
756 if( YmyprocD == Yroc )
757 {
758 if( YisRow )
759 {
760 if( XisRow )
761 TYPE->Fswap( &kbb, Mptr( X, Xii, k, Xld, size ), &Xlinc,
762 Mptr( Y, Yii, kk, Yld, size ), &Ylinc );
763 else
764 TYPE->Fswap( &kbb, Mptr( X, k, Xjj, Xld, size ), &Xlinc,
765 Mptr( Y, Yii, kk, Yld, size ), &Ylinc );
766 }
767 else
768 {
769 if( XisRow )
770 TYPE->Fswap( &kbb, Mptr( X, Xii, k, Xld, size ), &Xlinc,
771 Mptr( Y, kk, Yjj, Yld, size ), &Ylinc );
772 else
773 TYPE->Fswap( &kbb, Mptr( X, k, Xjj, Xld, size ), &Xlinc,
774 Mptr( Y, kk, Yjj, Yld, size ), &Ylinc );
775 }
776 kk += kbb;
777 }
778 else
779 {
780 if( XisRow )
781 TYPE->Fset( &kbb, zero, Mptr( X, Xii, k, Xld, size ),
782 &Xlinc );
783 else
784 TYPE->Fset( &kbb, zero, Mptr( X, k, Xjj, Xld, size ),
785 &Xlinc );
786 }
787 Yroc = MModAdd1( Yroc, YnprocsD );
788 }
789 }
790 else
791 {
792/*
793* If I don't own any of sub( Y ), then just zero sub( X )
794*/
795 TYPE->Fset( &N, zero, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc );
796 }
797/*
798* Replicate locally scattered sub( X ) by reducing it in the process scope of
799* sub( Y )
800*/
801 scope = ( YisRow ? CROW : CCOLUMN );
802 top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET );
803 if( XisRow )
804 TYPE->Cgsum2d( ctxt, &scope, top, 1, N, Mptr( X, Xii, Xjj, Xld,
805 size ), Xld, -1, 0 );
806 else
807 TYPE->Cgsum2d( ctxt, &scope, top, N, 1, Mptr( X, Xii, Xjj, Xld,
808 size ), Xld, -1, 0 );
809 }
810
811 if( !YisR )
812 {
813/*
814* If sub( Y ) is not replicated, then broadcast the result to the other pro-
815* cesses that own a piece of sub( X ), but were not involved in the above swap
816* operation.
817*/
818 if( XisRow ) { Xm = 1; Xn = N; }
819 else { Xm = N; Xn = 1; }
820
821 if( YisRow )
822 {
823 top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
824 if( YmyprocR == YprocR )
825 TYPE->Cgebs2d( ctxt, COLUMN, top, Xm, Xn, Mptr( X, Xii, Xjj, Xld,
826 size ), Xld );
827 else
828 TYPE->Cgebr2d( ctxt, COLUMN, top, Xm, Xn, Mptr( X, Xii, Xjj, Xld,
829 size ), Xld, YprocR, YmyprocD );
830 }
831 else
832 {
833 top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
834 if( YmyprocR == YprocR )
835 TYPE->Cgebs2d( ctxt, ROW, top, Xm, Xn, Mptr( X, Xii, Xjj, Xld,
836 size ), Xld );
837 else
838 TYPE->Cgebr2d( ctxt, ROW, top, Xm, Xn, Mptr( X, Xii, Xjj, Xld,
839 size ), Xld, YmyprocD, YprocR );
840 }
841 }
842 }
843/*
844* End of PB_CpswapND
845*/
846}
#define Int
Definition Bconfig.h:22
#define CCOLUMN
Definition PBblacs.h:20
#define TOP_GET
Definition PBblacs.h:50
#define COLUMN
Definition PBblacs.h:45
#define COMBINE
Definition PBblacs.h:49
#define CROW
Definition PBblacs.h:21
#define ROW
Definition PBblacs.h:46
void Cblacs_gridinfo()
#define BCAST
Definition PBblacs.h:48
#define CTXT_
Definition PBtools.h:38
Int PB_Cfirstnb()
#define MB_
Definition PBtools.h:43
char * PB_Cmalloc()
void PB_Cinfog2l()
#define MModSub(I1, I2, d)
Definition PBtools.h:102
#define MIN(a_, b_)
Definition PBtools.h:76
#define Mptr(a_, i_, j_, lda_, siz_)
Definition PBtools.h:132
#define LLD_
Definition PBtools.h:47
Int PB_Cnumroc()
char * PB_Ctop()
#define MModAdd1(I, d)
Definition PBtools.h:100
#define M_
Definition PBtools.h:39
#define MModAdd(I1, I2, d)
Definition PBtools.h:97
#define INB_
Definition PBtools.h:42
#define IMB_
Definition PBtools.h:41
void PB_CpswapND()
#define NB_
Definition PBtools.h:44
#define TYPE
Definition clamov.c:7