ScaLAPACK  2.0.2
ScaLAPACK: Scalable Linear Algebra PACKage
pcdotc_.c
Go to the documentation of this file.
00001 /* ---------------------------------------------------------------------
00002 *
00003 *  -- PBLAS routine (version 2.0) --
00004 *     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
00005 *     and University of California, Berkeley.
00006 *     April 1, 1998
00007 *
00008 *  ---------------------------------------------------------------------
00009 */
00010 /*
00011 *  Include files
00012 */
00013 #include "pblas.h"
00014 #include "PBpblas.h"
00015 #include "PBtools.h"
00016 #include "PBblacs.h"
00017 #include "PBblas.h"
00018 
00019 #ifdef __STDC__
00020 void pcdotc_( int * N,
00021               float * DOT,
00022               float * X, int * IX, int * JX, int * DESCX, int * INCX,
00023               float * Y, int * IY, int * JY, int * DESCY, int * INCY )
00024 #else
00025 void pcdotc_( N, DOT, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY )
00026 /*
00027 *  .. Scalar Arguments ..
00028 */
00029    int            * INCX, * INCY, * IX, * IY, * JX, * JY, * N;
00030    float          * DOT;
00031 /*
00032 *  .. Array Arguments ..
00033 */
00034    int            * DESCX, * DESCY;
00035    float          * X, * Y;
00036 #endif
00037 {
00038 /*
00039 *  Purpose
00040 *  =======
00041 *
00042 *  PCDOTC  forms the dot product of two subvectors,
00043 *
00044 *     DOT := sub( X )**H * sub( Y ),
00045 *
00046 *  where
00047 *
00048 *     sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X,
00049 *                      X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and,
00050 *
00051 *     sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y,
00052 *                      Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y.
00053 *
00054 *  Notes
00055 *  =====
00056 *
00057 *  A description  vector  is associated with each 2D block-cyclicly dis-
00058 *  tributed matrix.  This  vector  stores  the  information  required to
00059 *  establish the  mapping  between a  matrix entry and its corresponding
00060 *  process and memory location.
00061 *
00062 *  In  the  following  comments,   the character _  should  be  read  as
00063 *  "of  the  distributed  matrix".  Let  A  be a generic term for any 2D
00064 *  block cyclicly distributed matrix.  Its description vector is DESC_A:
00065 *
00066 *  NOTATION         STORED IN       EXPLANATION
00067 *  ---------------- --------------- ------------------------------------
00068 *  DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
00069 *  CTXT_A  (global) DESCA[ CTXT_  ] The BLACS context handle, indicating
00070 *                                   the NPROW x NPCOL BLACS process grid
00071 *                                   A  is  distributed over. The context
00072 *                                   itself  is  global,  but  the handle
00073 *                                   (the integer value) may vary.
00074 *  M_A     (global) DESCA[ M_     ] The  number of rows in the distribu-
00075 *                                   ted matrix A, M_A >= 0.
00076 *  N_A     (global) DESCA[ N_     ] The number of columns in the distri-
00077 *                                   buted matrix A, N_A >= 0.
00078 *  IMB_A   (global) DESCA[ IMB_   ] The number of rows of the upper left
00079 *                                   block of the matrix A, IMB_A > 0.
00080 *  INB_A   (global) DESCA[ INB_   ] The  number  of columns of the upper
00081 *                                   left   block   of   the  matrix   A,
00082 *                                   INB_A > 0.
00083 *  MB_A    (global) DESCA[ MB_    ] The blocking factor used to  distri-
00084 *                                   bute the last  M_A-IMB_A  rows of A,
00085 *                                   MB_A > 0.
00086 *  NB_A    (global) DESCA[ NB_    ] The blocking factor used to  distri-
00087 *                                   bute the last  N_A-INB_A  columns of
00088 *                                   A, NB_A > 0.
00089 *  RSRC_A  (global) DESCA[ RSRC_  ] The process row over which the first
00090 *                                   row of the matrix  A is distributed,
00091 *                                   NPROW > RSRC_A >= 0.
00092 *  CSRC_A  (global) DESCA[ CSRC_  ] The  process column  over  which the
00093 *                                   first column of  A  is  distributed.
00094 *                                   NPCOL > CSRC_A >= 0.
00095 *  LLD_A   (local)  DESCA[ LLD_   ] The  leading dimension  of the local
00096 *                                   array  storing  the  local blocks of
00097 *                                   the distributed matrix A,
00098 *                                   IF( Lc( 1, N_A ) > 0 )
00099 *                                      LLD_A >= MAX( 1, Lr( 1, M_A ) )
00100 *                                   ELSE
00101 *                                      LLD_A >= 1.
00102 *
00103 *  Let K be the number of  rows of a matrix A starting at the global in-
00104 *  dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
00105 *  that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
00106 *  receive if these K rows were distributed over NPROW processes.  If  K
00107 *  is the number of columns of a matrix  A  starting at the global index
00108 *  JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number  of co-
00109 *  lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would  receive if
00110 *  these K columns were distributed over NPCOL processes.
00111 *
00112 *  The values of Lr() and Lc() may be determined via a call to the func-
00113 *  tion PB_Cnumroc:
00114 *  Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
00115 *  Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
00116 *
00117 *  Arguments
00118 *  =========
00119 *
00120 *  N       (global input) INTEGER
00121 *          On entry,  N  specifies the  length of the  subvectors to  be
00122 *          multiplied. N must be at least zero.
00123 *
00124 *  DOT     (local output) COMPLEX array
00125 *          On exit, DOT  specifies the dot product of the two subvectors
00126 *          sub( X ) and sub( Y ) only in their scope (See below for fur-
00127 *          ther details).
00128 *
00129 *  X       (local input) COMPLEX array
00130 *          On entry, X is an array of dimension (LLD_X, Kx), where LLD_X
00131 *          is   at  least  MAX( 1, Lr( 1, IX ) )  when  INCX = M_X   and
00132 *          MAX( 1, Lr( 1, IX+N-1 ) )  otherwise,  and,  Kx  is  at least
00133 *          Lc( 1, JX+N-1 )  when  INCX = M_X  and Lc( 1, JX ) otherwise.
00134 *          Before  entry,  this  array contains the local entries of the
00135 *          matrix X.
00136 *
00137 *  IX      (global input) INTEGER
00138 *          On entry, IX  specifies X's global row index, which points to
00139 *          the beginning of the submatrix sub( X ).
00140 *
00141 *  JX      (global input) INTEGER
00142 *          On entry, JX  specifies X's global column index, which points
00143 *          to the beginning of the submatrix sub( X ).
00144 *
00145 *  DESCX   (global and local input) INTEGER array
00146 *          On entry, DESCX  is an integer array of dimension DLEN_. This
00147 *          is the array descriptor for the matrix X.
00148 *
00149 *  INCX    (global input) INTEGER
00150 *          On entry,  INCX   specifies  the  global  increment  for  the
00151 *          elements of  X.  Only two values of  INCX   are  supported in
00152 *          this version, namely 1 and M_X. INCX  must not be zero.
00153 *
00154 *  Y       (local input) COMPLEX array
00155 *          On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y
00156 *          is   at  least  MAX( 1, Lr( 1, IY ) )  when  INCY = M_Y   and
00157 *          MAX( 1, Lr( 1, IY+N-1 ) )  otherwise,  and,  Ky  is  at least
00158 *          Lc( 1, JY+N-1 )  when  INCY = M_Y  and Lc( 1, JY ) otherwise.
00159 *          Before  entry,  this array  contains the local entries of the
00160 *          matrix Y.
00161 *
00162 *  IY      (global input) INTEGER
00163 *          On entry, IY  specifies Y's global row index, which points to
00164 *          the beginning of the submatrix sub( Y ).
00165 *
00166 *  JY      (global input) INTEGER
00167 *          On entry, JY  specifies Y's global column index, which points
00168 *          to the beginning of the submatrix sub( Y ).
00169 *
00170 *  DESCY   (global and local input) INTEGER array
00171 *          On entry, DESCY  is an integer array of dimension DLEN_. This
00172 *          is the array descriptor for the matrix Y.
00173 *
00174 *  INCY    (global input) INTEGER
00175 *          On entry,  INCY   specifies  the  global  increment  for  the
00176 *          elements of  Y.  Only two values of  INCY   are  supported in
00177 *          this version, namely 1 and M_Y. INCY  must not be zero.
00178 *
00179 *  Further Details
00180 *  ===============
00181 *
00182 *  When  the  result  of  a vector-oriented PBLAS call is a scalar, this
00183 *  scalar  is set only within the process scope which owns the vector(s)
00184 *  being operated on. Let sub( X ) be a generic term for the input  vec-
00185 *  tor(s). Then, the processes owning the correct the answer is determi-
00186 *  ned as follows:  if  an  operation involves more than one vector, the
00187 *  processes receiving the result will be the union of the following set
00188 *  of processes for each vector:
00189 *
00190 *  If N = 1, M_X = 1 and INCX = 1,  then  one cannot determine if a pro-
00191 *  cess  row  or  process column owns the vector operand, therefore only
00192 *  the process owning sub( X ) receives the correct result;
00193 *
00194 *  If  INCX = M_X, then sub( X )  is a vector distributed over a process
00195 *  row. Each process in this row receives the result;
00196 *
00197 *  If  INCX = 1, then  sub( X )  is  a vector distributed over a process
00198 *  column. Each process in this column receives the result;
00199 *
00200 *  -- Written on April 1, 1998 by
00201 *     Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
00202 *
00203 *  ---------------------------------------------------------------------
00204 */
00205 /*
00206 *  .. Local Scalars ..
00207 */
00208    char           scope, * top;
00209    int            OneBlock, OneDgrid, RRorCC, Square, Xcol, Xi, Xii, XinbD,
00210                   Xinb1D, XisD, XisR, XisRow, Xj, Xjj, Xld, Xlinc, XmyprocD,
00211                   XmyprocR, XnbD, XnpD, XnprocsD, XnprocsR, XprocD, XprocR,
00212                   Xrow, Ycol, Yi, Yii, YinbD, Yinb1D, YisD, YisR, YisRow, Yj,
00213                   Yjj, Yld, Ylinc, YmyprocD, YmyprocR, YnbD, YnpD, YnprocsD,
00214                   YnprocsR, YprocD, YprocR, Yrow, cdst, csrc, ctxt, dst, info,
00215                   ione=1, mycol, myrow, npcol, nprow, rdst, rsrc, size, src;
00216    PBTYP_T        * type;
00217    VVDOT_T        dot;
00218 /*
00219 *  .. Local Arrays ..
00220 */
00221    char           * buf = NULL;
00222    int            Xd[DLEN_], Yd[DLEN_], dbuf[ DLEN_ ];
00223 /* ..
00224 *  .. Executable Statements ..
00225 *
00226 */
00227    PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd );
00228    PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd );
00229 #ifndef NO_ARGCHK
00230 /*
00231 *  Test the input parameters
00232 */
00233    Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
00234    if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) )
00235    {
00236       PB_Cchkvec( ctxt, "PCDOTC", "X", *N, 1, Xi, Xj, Xd, *INCX,  6, &info );
00237       PB_Cchkvec( ctxt, "PCDOTC", "Y", *N, 1, Yi, Yj, Yd, *INCY, 11, &info );
00238    }
00239    if( info ) { PB_Cabort( ctxt, "PCDOTC", info ); return; }
00240 #endif
00241    DOT[REAL_PART] = ZERO;
00242    DOT[IMAG_PART] = ZERO;
00243 /*
00244 *  Quick return if possible
00245 */
00246    if( *N == 0 ) return;
00247 /*
00248 *  Handle degenerate case
00249 */
00250    if( ( *N == 1 ) && ( ( Xd[ M_ ] == 1 ) || ( Yd[ M_ ] == 1 ) ) )
00251    {
00252       type = PB_Cctypeset();
00253       PB_Cpdot11( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX,
00254                   ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotc );
00255       return;
00256    }
00257 /*
00258 *  Start the operations
00259 */
00260 #ifdef NO_ARGCHK
00261    Cblacs_gridinfo( ( ctxt = Xd[ CTXT_ ] ), &nprow, &npcol, &myrow, &mycol );
00262 #endif
00263 /*
00264 *  Determine if sub( X ) is distributed or not
00265 */
00266    if( ( XisRow = ( *INCX == Xd[M_] ) ) != 0 )
00267       XisD = ( ( Xd[CSRC_] >= 0 ) && ( ( XnprocsD = npcol ) > 1 ) );
00268    else
00269       XisD = ( ( Xd[RSRC_] >= 0 ) && ( ( XnprocsD = nprow ) > 1 ) );
00270 /*
00271 *  Determine if sub( Y ) is distributed or not
00272 */
00273    if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 )
00274       YisD = ( ( Yd[CSRC_] >= 0 ) && ( ( YnprocsD = npcol ) > 1 ) );
00275    else
00276       YisD = ( ( Yd[RSRC_] >= 0 ) && ( ( YnprocsD = nprow ) > 1 ) );
00277 /*
00278 *  Are sub( X ) and sub( Y ) both row or column vectors ?
00279 */
00280    RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) );
00281 /*
00282 *  XisD && YisD <=> both vector operands are indeed distributed
00283 */
00284    if( XisD && YisD )
00285    {
00286 /*
00287 *  Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol
00288 */
00289       PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj,
00290                    &Xrow, &Xcol );
00291       if( XisRow )
00292       {
00293          XinbD  = Xd[INB_]; XnbD     = Xd[NB_];
00294          Xld    = Xd[LLD_]; Xlinc    = Xld;
00295          XprocD = Xcol;     XmyprocD = mycol;
00296          XprocR = Xrow;     XmyprocR = myrow;   XnprocsR = nprow;
00297          XisR   = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) );
00298          Mfirstnb( Xinb1D, *N, Xj, XinbD, XnbD );
00299       }
00300       else
00301       {
00302          XinbD  = Xd[IMB_]; XnbD     = Xd[MB_];
00303          Xld    = Xd[LLD_]; Xlinc    = 1;
00304          XprocD = Xrow;     XmyprocD = myrow;
00305          XprocR = Xcol;     XmyprocR = mycol;   XnprocsR = npcol;
00306          XisR   = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) );
00307          Mfirstnb( Xinb1D, *N, Xi, XinbD, XnbD );
00308       }
00309 /*
00310 *  Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol
00311 */
00312       PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj,
00313                    &Yrow, &Ycol );
00314       if( YisRow )
00315       {
00316          YinbD  = Yd[INB_]; YnbD     = Yd[NB_];
00317          Yld    = Yd[LLD_]; Ylinc    = Yld;
00318          YprocD = Ycol;     YmyprocD = mycol;
00319          YprocR = Yrow;     YmyprocR = myrow;   YnprocsR = nprow;
00320          YisR   = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) );
00321          Mfirstnb( Yinb1D, *N, Yj, YinbD, YnbD );
00322       }
00323       else
00324       {
00325          YinbD  = Yd[IMB_]; YnbD     = Yd[MB_];
00326          Yld    = Yd[LLD_]; Ylinc    = 1;
00327          YprocD = Yrow;     YmyprocD = myrow;
00328          YprocR = Ycol;     YmyprocR = mycol;   YnprocsR = npcol;
00329          YisR   = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) );
00330          Mfirstnb( Yinb1D, *N, Yi, YinbD, YnbD );
00331       }
00332 /*
00333 *  Do sub( X ) and sub( Y ) span more than one process ?
00334 */
00335       OneDgrid = ( ( XnprocsD ==  1 ) && ( YnprocsD ==  1 ) );
00336       OneBlock = ( ( Xinb1D   >= *N ) && ( Yinb1D   >= *N ) );
00337 /*
00338 *  Are sub( X ) and sub( Y ) distributed in the same manner ?
00339 */
00340       Square   = ( ( Xinb1D   ==   Yinb1D ) && ( XnbD == YnbD ) &&
00341                    ( XnprocsD == YnprocsD ) );
00342 
00343       if( !( XisR ) )
00344       {
00345 /*
00346 *  sub( X ) is not replicated
00347 */
00348          if( YisR )
00349          {
00350 /*
00351 *  If sub( X ) is not replicated, but sub( Y ) is, a process row or column
00352 *  YprocR need to be selected. It will contain the non-replicated vector used
00353 *  to perform the dot product computation.
00354 */
00355             if( RRorCC )
00356             {
00357 /*
00358 *  sub( X ) and sub( Y ) are both row or column vectors
00359 */
00360                if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) )
00361                {
00362 /*
00363 *  sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD.
00364 *  Enforce a purely local operation by choosing YprocR to be equal to XprocR.
00365 */
00366                   YprocR = XprocR;
00367                }
00368                else
00369                {
00370 /*
00371 *  Otherwise, communication has to occur, so choose the next process row or
00372 *  column for YprocR to maximize the number of links, i.e reduce contention.
00373 */
00374                   YprocR = MModAdd1( XprocR, XnprocsR );
00375                }
00376             }
00377             else
00378             {
00379 /*
00380 *  sub( X ) and sub( Y ) are distributed in orthogonal directions, what is
00381 *  chosen for YprocR does not really matter. Select the process origin.
00382 */
00383                YprocR = XprocD;
00384             }
00385          }
00386          else
00387          {
00388 /*
00389 *  Neither sub( X ) nor sub( Y ) are replicated. If I am not in process row or
00390 *  column XprocR and not in process row or column YprocR, then quick return.
00391 */
00392             if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) )
00393                return;
00394          }
00395       }
00396       else
00397       {
00398 /*
00399 *  sub( X ) is distributed and replicated (so no quick return possible)
00400 */
00401          if( YisR )
00402          {
00403 /*
00404 *  sub( Y ) is distributed and replicated as well
00405 */
00406             if( RRorCC )
00407             {
00408 /*
00409 *  sub( X ) and sub( Y ) are both row or column vectors
00410 */
00411                if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) )
00412                {
00413 /*
00414 *  sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD.
00415 *  Enforce a purely local operation by choosing XprocR and YprocR to be equal
00416 *  to zero.
00417 */
00418                   XprocR = YprocR = 0;
00419                }
00420                else
00421                {
00422 /*
00423 *  Otherwise, communication has to occur, so select YprocR to be zero and the
00424 *  next process row or column for XprocR in order to maximize the number of
00425 *  used links, i.e reduce contention.
00426 */
00427                   YprocR = 0;
00428                   XprocR = MModAdd1( YprocR, YnprocsR );
00429                }
00430             }
00431             else
00432             {
00433 /*
00434 *  sub( X ) and sub( Y ) are distributed in orthogonal directions, select the
00435 *  origin processes.
00436 */
00437                XprocR = YprocD;
00438                YprocR = XprocD;
00439             }
00440          }
00441          else
00442          {
00443 /*
00444 *  sub( Y ) is distributed, but not replicated
00445 */
00446             if( RRorCC )
00447             {
00448 /*
00449 *  sub( X ) and sub( Y ) are both row or column vectors
00450 */
00451                if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) )
00452                {
00453 /*
00454 *  sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD.
00455 *  Enforce a purely local operation by choosing XprocR to be equal to YprocR.
00456 */
00457                   XprocR = YprocR;
00458                }
00459                else
00460                {
00461 /*
00462 *  Otherwise, communication has to occur, so choose the next process row or
00463 *  column for XprocR to maximize the number of links, i.e reduce contention.
00464 */
00465                   XprocR = MModAdd1( YprocR, YnprocsR );
00466                }
00467             }
00468             else
00469             {
00470 /*
00471 *  sub( X ) and sub( Y ) are distributed in orthogonal directions, what is
00472 *  chosen for XprocR does not really matter. Select the origin process.
00473 */
00474                XprocR = YprocD;
00475             }
00476          }
00477       }
00478 /*
00479 *  Even if sub( X ) and/or sub( Y ) are replicated, only two process row or
00480 *  column are active, namely XprocR and YprocR. If any of those operands is
00481 *  replicated, broadcast will occur (unless there is an easy way out).
00482 */
00483       type = PB_Cctypeset(); size = type->size; dot = type->Fvvdotc;
00484 /*
00485 *  A purely operation occurs iff the operands start in the same process and if
00486 *  either the grid is mono-dimensional or there is a single local block to be
00487 *  operated with or if both operands are aligned.
00488 */
00489       if( ( (    RRorCC   && ( XprocD == YprocD ) && ( XprocR == YprocR ) ) ||
00490             ( !( RRorCC ) && ( XprocD == YprocR ) && ( XprocR == YprocD ) ) ) &&
00491           ( OneDgrid || OneBlock || ( RRorCC && Square ) ) )
00492       {
00493          if( ( !XisR &&         ( XmyprocR == XprocR ) &&
00494                !YisR &&         ( YmyprocR == YprocR ) ) ||
00495              ( !XisR && YisR && ( YmyprocR == YprocR ) ) ||
00496              ( !YisR && XisR && ( XmyprocR == XprocR ) ) ||
00497              (  XisR && YisR                           ) )
00498          {
00499             XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD,
00500                                XnprocsD );
00501             YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD,
00502                                YnprocsD );
00503             if( ( XnpD > 0 ) && ( YnpD > 0 ) )
00504             {
00505                dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld,
00506                     size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ),
00507                     &Ylinc );
00508             }
00509          }
00510 /*
00511 *  Combine the local results in sub( X )'s scope
00512 */
00513          if( ( XisR && YisR ) || ( XmyprocR == XprocR ) )
00514          {
00515             scope = ( XisRow ? CROW : CCOLUMN );
00516             top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET );
00517             Ccgsum2d( ctxt, &scope, top, 1, 1, ((char *) DOT), 1, -1, 0 );
00518          }
00519          if( RRorCC && XisR && YisR ) return;
00520       }
00521       else if( ( RRorCC && OneDgrid ) || OneBlock || Square )
00522       {
00523 /*
00524 *  Otherwise, it may be possible to compute the desired dot-product in a single
00525 *  message exchange iff the grid is mono-dimensional and the operands are
00526 *  distributed in the same direction, or there is just one block to be exchanged
00527 *  or if both operands are similarly distributed in their respective direction.
00528 */
00529          if( ( YmyprocR == YprocR ) )
00530          {
00531 /*
00532 *  The processes owning a piece of sub( Y ) send it to the corresponding
00533 *  process owning s piece of sub ( X ).
00534 */
00535             YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD,
00536                                YnprocsD );
00537             if( YnpD > 0 )
00538             {
00539                dst = XprocD + MModSub( YmyprocD, YprocD, YnprocsD );
00540                dst = MPosMod( dst, XnprocsD );
00541                if( XisRow ) { rdst = XprocR; cdst = dst; }
00542                else         { rdst = dst; cdst = XprocR; }
00543 
00544                if( ( myrow == rdst ) && ( mycol == cdst ) )
00545                {
00546                   dot( &YnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld,
00547                        size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld,
00548                        size ), &Ylinc );
00549                }
00550                else
00551                {
00552                   if( YisRow )
00553                      Ccgesd2d( ctxt, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj,
00554                                Yld, size ), Yld, rdst, cdst );
00555                   else
00556                      Ccgesd2d( ctxt, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj,
00557                                Yld, size ), Yld, rdst, cdst );
00558                }
00559             }
00560          }
00561          if( XmyprocR == XprocR )
00562          {
00563 /*
00564 *  The processes owning a piece of sub( X ) receive the corresponding local
00565 *  piece of sub( Y ), compute the local dot product and combine the results
00566 *  within sub( X )'s scope.
00567 */
00568             XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD,
00569                                XnprocsD );
00570             if( XnpD > 0 )
00571             {
00572                src = YprocD + MModSub( XmyprocD, XprocD, XnprocsD );
00573                src = MPosMod( src, YnprocsD );
00574                if( YisRow ) { rsrc = YprocR; csrc = src; }
00575                else         { rsrc = src; csrc = YprocR; }
00576                if( ( myrow != rsrc ) || ( mycol != csrc ) )
00577                {
00578                   buf = PB_Cmalloc( XnpD * size );
00579                   if( YisRow )
00580                      Ccgerv2d( ctxt, 1, XnpD, buf,    1, rsrc, csrc );
00581                   else
00582                      Ccgerv2d( ctxt, XnpD, 1, buf, XnpD, rsrc, csrc );
00583                   dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld,
00584                        size ), &Xlinc, buf, &ione );
00585                   if( buf ) free( buf );
00586                }
00587             }
00588             if( XisRow )
00589             {
00590                top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
00591                Ccgsum2d( ctxt, ROW,    top, 1, 1, ((char*)DOT), 1, -1, 0 );
00592             }
00593             else
00594             {
00595                top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
00596                Ccgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 );
00597             }
00598          }
00599       }
00600       else
00601       {
00602 /*
00603 *  General case, copy sub( Y ) within sub( X )'s scope, compute the local
00604 *  results and combine them within sub( X )'s scope.
00605 */
00606          XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD );
00607 
00608          if( XisRow )
00609          {
00610             PB_Cdescset( dbuf, 1, *N, 1, Xinb1D, 1, XnbD, XprocR, XprocD, ctxt,
00611                          1 );
00612          }
00613          else
00614          {
00615             PB_Cdescset( dbuf, *N, 1, Xinb1D, 1, XnbD, 1, XprocD, XprocR, ctxt,
00616                          MAX( 1, XnpD ) );
00617          }
00618          if( ( XmyprocR == XprocR ) && ( XnpD > 0 ) )
00619             buf = PB_Cmalloc( XnpD * size );
00620 
00621          if( YisRow )
00622          {
00623             PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, ((char *) Y), Yi, Yj,
00624                         Yd, ROW, type->zero, buf, 0, 0, dbuf, ( XisRow ? ROW :
00625                         COLUMN ) );
00626          }
00627          else
00628          {
00629             PB_Cpaxpby( type, NOCONJG, *N, 1, type->one, ((char *) Y), Yi, Yj,
00630                         Yd, COLUMN, type->zero, buf, 0, 0, dbuf, ( XisRow ?
00631                         ROW : COLUMN ) );
00632          }
00633 
00634          if( XmyprocR == XprocR )
00635          {
00636             if( XnpD > 0 )
00637             {
00638                dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld,
00639                     size ), &Xlinc, buf, &ione );
00640                if( buf ) free( buf );
00641             }
00642             if( XisRow )
00643             {
00644                top = PB_Ctop( &ctxt, COMBINE, ROW,    TOP_GET );
00645                Ccgsum2d( ctxt, ROW,    top, 1, 1, ((char*)DOT), 1, -1, 0 );
00646             }
00647             else
00648             {
00649                top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
00650                Ccgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 );
00651             }
00652          }
00653       }
00654 /*
00655 *  Send the DOT product result within sub( Y )'s scope
00656 */
00657       if( XisR || YisR )
00658       {
00659 /*
00660 *  Either sub( X ) or sub( Y ) are replicated, so that every process should have
00661 *  the result -> broadcast it orthogonally from sub( X )'s direction.
00662 */
00663          if( XisRow )
00664          {
00665            top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
00666            if( XmyprocR == XprocR )
00667               Ccgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 );
00668            else
00669               Ccgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, XprocR,
00670                         XmyprocD );
00671          }
00672          else
00673          {
00674            top = PB_Ctop( &ctxt, BCAST, ROW,    TOP_GET );
00675            if( XmyprocR == XprocR )
00676               Ccgebs2d( ctxt, ROW,    top, 1, 1, ((char*)DOT), 1 );
00677            else
00678               Ccgebr2d( ctxt, ROW,    top, 1, 1, ((char*)DOT), 1, XmyprocD,
00679                         XprocR );
00680          }
00681       }
00682       else
00683       {
00684 /*
00685 *  Neither sub( X ) nor sub( Y ) are replicated
00686 */
00687          if( RRorCC )
00688          {
00689 /*
00690 *  Both sub( X ) are distributed in the same direction -> the process row or
00691 *  column XprocR sends the result to the process row or column YprocR.
00692 */
00693             if( XprocR != YprocR )
00694             {
00695                if( XmyprocR == XprocR )
00696                {
00697                   if( XisRow )
00698                      Ccgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YprocR,
00699                                YmyprocD );
00700                   else
00701                      Ccgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YmyprocD,
00702                                YprocR );
00703                }
00704                else if( YmyprocR == YprocR )
00705                {
00706                   if( XisRow )
00707                      Ccgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XprocR,
00708                                XmyprocD );
00709                   else
00710                      Ccgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XmyprocD,
00711                                XprocR );
00712                }
00713             }
00714          }
00715          else
00716          {
00717 /*
00718 *  Otherwise, the process at the intersection of sub( X )'s and sub( Y )'s
00719 *  scope, broadcast the result within sub( Y )'s scope.
00720 */
00721             if( YmyprocR == YprocR )
00722             {
00723                if( YisRow )
00724                {
00725                   top = PB_Ctop( &ctxt, BCAST, ROW,    TOP_GET );
00726                   if( YmyprocD == XprocR )
00727                      Ccgebs2d( ctxt, ROW,    top, 1, 1, ((char*)DOT), 1 );
00728                   else
00729                      Ccgebr2d( ctxt, ROW,    top, 1, 1, ((char*)DOT), 1,
00730                                YprocR, XprocR );
00731                }
00732                else
00733                {
00734                   top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
00735                   if( YmyprocD == XprocR )
00736                      Ccgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 );
00737                   else
00738                      Ccgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1,
00739                                XprocR, YprocR );
00740                }
00741             }
00742          }
00743       }
00744    }
00745    else if( !( XisD ) && YisD )
00746    {
00747 /*
00748 *  sub( X ) is not distributed and sub( Y ) is distributed.
00749 */
00750       type = PB_Cctypeset();
00751       PB_CpdotND( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX,
00752                   ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotc );
00753    }
00754    else if( XisD && !( YisD ) )
00755    {
00756 /*
00757 *  sub( X ) is distributed and sub( Y ) is not distributed.
00758 */
00759       type = PB_Cctypeset();
00760 /*
00761 *     Compute DOT := sub( Y )**H * sub( X )
00762 */
00763       PB_CpdotND( type, *N, ((char *) DOT), ((char *) Y), Yi, Yj, Yd, *INCY,
00764                   ((char *) X), Xi, Xj, Xd, *INCX, type->Fvvdotc );
00765 /*
00766 *     Conjugate the result
00767 */
00768       DOT[IMAG_PART] = -DOT[IMAG_PART];
00769    }
00770    else
00771    {
00772 /*
00773 *     Neither sub( X ) nor sub( Y ) are distributed
00774 */
00775       type = PB_Cctypeset();
00776       PB_CpdotNN( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX,
00777                   ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotc );
00778    }
00779 /*
00780 *  End of PCDOTC
00781 */
00782 }