ScaLAPACK  2.0.2
ScaLAPACK: Scalable Linear Algebra PACKage
PB_CpdotNN.c
Go to the documentation of this file.
00001 /* ---------------------------------------------------------------------
00002 *
00003 *  -- PBLAS auxiliary routine (version 2.0) --
00004 *     University of Tennessee, Knoxville, Oak Ridge National Laboratory,
00005 *     and University of California, Berkeley.
00006 *     April 1, 1998
00007 *
00008 *  ---------------------------------------------------------------------
00009 */
00010 /*
00011 *  Include files
00012 */
00013 #include "../pblas.h"
00014 #include "../PBpblas.h"
00015 #include "../PBtools.h"
00016 #include "../PBblacs.h"
00017 #include "../PBblas.h"
00018 
00019 #ifdef __STDC__
00020 void PB_CpdotNN( PBTYP_T * TYPE, int N, char * DOT,
00021                  char * X, int IX, int JX, int * DESCX, int INCX,
00022                  char * Y, int IY, int JY, int * DESCY, int INCY,
00023                  VVDOT_T FDOT )
00024 #else
00025 void PB_CpdotNN( TYPE, N, DOT, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY,
00026                  INCY, FDOT )
00027 /*
00028 *  .. Scalar Arguments ..
00029 */
00030    int            INCX, INCY, IX, IY, JX, JY, N;
00031    char           * DOT;
00032    PBTYP_T        * TYPE;
00033    VVDOT_T        FDOT;
00034 /*
00035 *  .. Array Arguments ..
00036 */
00037    int            * DESCX, * DESCY;
00038    char           * X, * Y;
00039 #endif
00040 {
00041 /*
00042 *  Purpose
00043 *  =======
00044 *
00045 *  PB_CpdotNN forms the dot product of two subvectors,
00046 *
00047 *     DOT := sub( X )**T * sub( Y )  or   DOT := sub( X )**H * sub( Y ),
00048 *
00049 *  where
00050 *
00051 *     sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X,
00052 *                      X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and,
00053 *
00054 *     sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y,
00055 *                      Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y.
00056 *
00057 *  Both subvectors are assumed to be not distributed.
00058 *
00059 *  Notes
00060 *  =====
00061 *
00062 *  A description  vector  is associated with each 2D block-cyclicly dis-
00063 *  tributed matrix.  This  vector  stores  the  information  required to
00064 *  establish the  mapping  between a  matrix entry and its corresponding
00065 *  process and memory location.
00066 *
00067 *  In  the  following  comments,   the character _  should  be  read  as
00068 *  "of  the  distributed  matrix".  Let  A  be a generic term for any 2D
00069 *  block cyclicly distributed matrix.  Its description vector is DESC_A:
00070 *
00071 *  NOTATION         STORED IN       EXPLANATION
00072 *  ---------------- --------------- ------------------------------------
00073 *  DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
00074 *  CTXT_A  (global) DESCA[ CTXT_  ] The BLACS context handle, indicating
00075 *                                   the NPROW x NPCOL BLACS process grid
00076 *                                   A  is  distributed over. The context
00077 *                                   itself  is  global,  but  the handle
00078 *                                   (the integer value) may vary.
00079 *  M_A     (global) DESCA[ M_     ] The  number of rows in the distribu-
00080 *                                   ted matrix A, M_A >= 0.
00081 *  N_A     (global) DESCA[ N_     ] The number of columns in the distri-
00082 *                                   buted matrix A, N_A >= 0.
00083 *  IMB_A   (global) DESCA[ IMB_   ] The number of rows of the upper left
00084 *                                   block of the matrix A, IMB_A > 0.
00085 *  INB_A   (global) DESCA[ INB_   ] The  number  of columns of the upper
00086 *                                   left   block   of   the  matrix   A,
00087 *                                   INB_A > 0.
00088 *  MB_A    (global) DESCA[ MB_    ] The blocking factor used to  distri-
00089 *                                   bute the last  M_A-IMB_A  rows of A,
00090 *                                   MB_A > 0.
00091 *  NB_A    (global) DESCA[ NB_    ] The blocking factor used to  distri-
00092 *                                   bute the last  N_A-INB_A  columns of
00093 *                                   A, NB_A > 0.
00094 *  RSRC_A  (global) DESCA[ RSRC_  ] The process row over which the first
00095 *                                   row of the matrix  A is distributed,
00096 *                                   NPROW > RSRC_A >= 0.
00097 *  CSRC_A  (global) DESCA[ CSRC_  ] The  process column  over  which the
00098 *                                   first column of  A  is  distributed.
00099 *                                   NPCOL > CSRC_A >= 0.
00100 *  LLD_A   (local)  DESCA[ LLD_   ] The  leading dimension  of the local
00101 *                                   array  storing  the  local blocks of
00102 *                                   the distributed matrix A,
00103 *                                   IF( Lc( 1, N_A ) > 0 )
00104 *                                      LLD_A >= MAX( 1, Lr( 1, M_A ) )
00105 *                                   ELSE
00106 *                                      LLD_A >= 1.
00107 *
00108 *  Let K be the number of  rows of a matrix A starting at the global in-
00109 *  dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
00110 *  that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
00111 *  receive if these K rows were distributed over NPROW processes.  If  K
00112 *  is the number of columns of a matrix  A  starting at the global index
00113 *  JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number  of co-
00114 *  lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would  receive if
00115 *  these K columns were distributed over NPCOL processes.
00116 *
00117 *  The values of Lr() and Lc() may be determined via a call to the func-
00118 *  tion PB_Cnumroc:
00119 *  Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
00120 *  Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
00121 *
00122 *  Arguments
00123 *  =========
00124 *
00125 *  TYPE    (local input) pointer to a PBTYP_T structure
00126 *          On entry,  TYPE  is a pointer to a structure of type PBTYP_T,
00127 *          that contains type information (See pblas.h).
00128 *
00129 *  N       (global input) INTEGER
00130 *          On entry,  N  specifies the  length of the  subvectors to  be
00131 *          multiplied. N must be at least zero.
00132 *
00133 *  DOT     (local output) pointer to CHAR
00134 *          On exit, DOT  specifies the dot product of the two subvectors
00135 *          sub( X ) and sub( Y ) only in their scope (See below for fur-
00136 *          ther details).
00137 *
00138 *  X       (local input) pointer to CHAR
00139 *          On entry, X is an array of dimension (LLD_X, Kx), where LLD_X
00140 *          is   at  least  MAX( 1, Lr( 1, IX ) )  when  INCX = M_X   and
00141 *          MAX( 1, Lr( 1, IX+N-1 ) )  otherwise,  and,  Kx  is  at least
00142 *          Lc( 1, JX+N-1 )  when  INCX = M_X  and Lc( 1, JX ) otherwise.
00143 *          Before  entry,  this  array contains the local entries of the
00144 *          matrix X.
00145 *
00146 *  IX      (global input) INTEGER
00147 *          On entry, IX  specifies X's global row index, which points to
00148 *          the beginning of the submatrix sub( X ).
00149 *
00150 *  JX      (global input) INTEGER
00151 *          On entry, JX  specifies X's global column index, which points
00152 *          to the beginning of the submatrix sub( X ).
00153 *
00154 *  DESCX   (global and local input) INTEGER array
00155 *          On entry, DESCX  is an integer array of dimension DLEN_. This
00156 *          is the array descriptor for the matrix X.
00157 *
00158 *  INCX    (global input) INTEGER
00159 *          On entry,  INCX   specifies  the  global  increment  for  the
00160 *          elements of  X.  Only two values of  INCX   are  supported in
00161 *          this version, namely 1 and M_X. INCX  must not be zero.
00162 *
00163 *  Y       (local input) pointer to CHAR
00164 *          On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y
00165 *          is   at  least  MAX( 1, Lr( 1, IY ) )  when  INCY = M_Y   and
00166 *          MAX( 1, Lr( 1, IY+N-1 ) )  otherwise,  and,  Ky  is  at least
00167 *          Lc( 1, JY+N-1 )  when  INCY = M_Y  and Lc( 1, JY ) otherwise.
00168 *          Before  entry,  this array  contains the local entries of the
00169 *          matrix Y.
00170 *
00171 *  IY      (global input) INTEGER
00172 *          On entry, IY  specifies Y's global row index, which points to
00173 *          the beginning of the submatrix sub( Y ).
00174 *
00175 *  JY      (global input) INTEGER
00176 *          On entry, JY  specifies Y's global column index, which points
00177 *          to the beginning of the submatrix sub( Y ).
00178 *
00179 *  DESCY   (global and local input) INTEGER array
00180 *          On entry, DESCY  is an integer array of dimension DLEN_. This
00181 *          is the array descriptor for the matrix Y.
00182 *
00183 *  INCY    (global input) INTEGER
00184 *          On entry,  INCY   specifies  the  global  increment  for  the
00185 *          elements of  Y.  Only two values of  INCY   are  supported in
00186 *          this version, namely 1 and M_Y. INCY  must not be zero.
00187 *
00188 *  FDOT    (local input) pointer to a function of type VVDOT
00189 *          On entry, FDOT points to a subroutine that computes the local
00190 *          dot product of two vectors.
00191 *
00192 *  Further Details
00193 *  ===============
00194 *
00195 *  When  the  result  of  a vector-oriented PBLAS call is a scalar, this
00196 *  scalar  is set only within the process scope which owns the vector(s)
00197 *  being operated on. Let sub( X ) be a generic term for the input  vec-
00198 *  tor(s). Then, the processes owning the correct the answer is determi-
00199 *  ned as follows:  if  an  operation involves more than one vector, the
00200 *  processes receiving the result will be the union of the following set
00201 *  of processes for each vector:
00202 *
00203 *  If N = 1, M_X = 1 and INCX = 1,  then  one cannot determine if a pro-
00204 *  cess  row  or  process column owns the vector operand, therefore only
00205 *  the process owning sub( X ) receives the correct result;
00206 *
00207 *  If  INCX = M_X, then sub( X )  is a vector distributed over a process
00208 *  row. Each process in this row receives the result;
00209 *
00210 *  If  INCX = 1, then  sub( X )  is  a vector distributed over a process
00211 *  column. Each process in this column receives the result;
00212 *
00213 *  -- Written on April 1, 1998 by
00214 *     Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
00215 *
00216 *  ---------------------------------------------------------------------
00217 */
00218 /*
00219 *  .. Local Scalars ..
00220 */
00221    char           Xscope, Yscope, * top;
00222    int            RRorCC, Xcol, Xii, XisR, XisRow, Xjj, Xld, Xlinc, XmyprocD,
00223                   XmyprocR, XnprocsR, XprocR, Xrow, Ycol, Yii, YisR, YisRow,
00224                   Yjj, Yld, Ylinc, YmyprocD, YmyprocR, YnprocsR, YprocR, Yrow,
00225                   csrc, ctxt, ione=1, mycol, myrow, npcol, nprow, rsrc, size;
00226 /*
00227 *  .. Local Arrays ..
00228 */
00229    char           * buf = NULL;
00230 /* ..
00231 *  .. Executable Statements ..
00232 *
00233 */
00234 /*
00235 *  Retrieve process grid information
00236 */
00237    Cblacs_gridinfo( ( ctxt = DESCX[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
00238 /*
00239 *  Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol ...
00240 */
00241    PB_Cinfog2l( IX, JX, DESCX, nprow, npcol, myrow, mycol, &Xii, &Xjj,
00242                 &Xrow, &Xcol );
00243    if( ( XisRow = ( INCX == DESCX[M_] ) ) != 0 )
00244    {
00245       Xld      = DESCX[LLD_];          Xlinc    = Xld;
00246       XmyprocD = mycol; XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow;
00247       XisR     = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) );
00248    }
00249    else
00250    {
00251       Xld      = DESCX[LLD_];          Xlinc    = 1;
00252       XmyprocD = myrow; XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol;
00253       XisR     = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) );
00254    }
00255 /*
00256 *  Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol ...
00257 */
00258    PB_Cinfog2l( IY, JY, DESCY, nprow, npcol, myrow, mycol, &Yii, &Yjj,
00259                 &Yrow, &Ycol );
00260    if( ( YisRow = ( INCY == DESCY[M_] ) ) != 0 )
00261    {
00262       Yld      = DESCY[LLD_];          Ylinc    = Yld;
00263       YmyprocD = mycol; YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow;
00264       YisR     = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) );
00265    }
00266    else
00267    {
00268       Yld      = DESCY[LLD_];          Ylinc    = 1;
00269       YmyprocD = myrow; YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol;
00270       YisR     = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) );
00271    }
00272 /*
00273 *  Are sub( X ) and sub( Y ) both row or column vectors ?
00274 */
00275    RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) );
00276 /*
00277 *  Neither sub( X ) nor sub( Y ) are distributed
00278 */
00279    if( !XisR )
00280    {
00281 /*
00282 *     sub( X ) is not replicated
00283 */
00284       if( !( YisR ) )
00285       {
00286 /*
00287 *  sub( Y ) is not replicated
00288 */
00289          if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) )
00290 /*
00291 *  If I am not in XprocR or YprocR, then return immediately
00292 */
00293             return;
00294 
00295          size = TYPE->size;
00296 
00297          if( RRorCC )
00298          {
00299 /*
00300 *  sub( X ) and sub( Y ) are both row or column vectors
00301 */
00302             if( XprocR == YprocR )
00303             {
00304 /*
00305 *  sub( X ) and sub( Y ) are in the same process row or column
00306 */
00307                FDOT( &N, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y,
00308                      Yii, Yjj, Yld, size ), &Ylinc );
00309             }
00310             else
00311             {
00312 /*
00313 *  sub( X ) and sub( Y ) are in a different process row or column
00314 */
00315                if( XmyprocR == XprocR )
00316                {
00317                   buf = PB_Cmalloc( N * size );
00318 /*
00319 *  Send sub( X ) to where sub( Y ) resides, and receive sub( Y ) from the same
00320 *  location.
00321 */
00322                   if( XisRow )
00323                   {
00324                      TYPE->Cgesd2d( ctxt, 1, N, Mptr( X, Xii, Xjj, Xld, size ),
00325                                     Xld, YprocR, XmyprocD );
00326                      TYPE->Cgerv2d( ctxt, 1, N, buf, 1, YprocR, XmyprocD );
00327                   }
00328                   else
00329                   {
00330                      TYPE->Cgesd2d( ctxt, N, 1, Mptr( X, Xii, Xjj, Xld, size ),
00331                                     Xld, XmyprocD, YprocR );
00332                      TYPE->Cgerv2d( ctxt, N, 1, buf, N, XmyprocD, YprocR );
00333                   }
00334                   FDOT( &N, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, buf,
00335                         &ione );
00336                   if( buf ) free( buf );
00337                }
00338 
00339                if( YmyprocR == YprocR )
00340                {
00341                   buf = PB_Cmalloc( N * size );
00342 /*
00343 *  Send sub( Y ) to where sub( X ) resides, and receive sub( X ) from the same
00344 *  location.
00345 */
00346                   if( YisRow )
00347                   {
00348                      TYPE->Cgesd2d( ctxt, 1, N, Mptr( Y, Yii, Yjj, Yld, size ),
00349                                     Yld, XprocR, YmyprocD );
00350                      TYPE->Cgerv2d( ctxt, 1, N, buf, 1, XprocR, YmyprocD );
00351                   }
00352                   else
00353                   {
00354                      TYPE->Cgesd2d( ctxt, N, 1, Mptr( Y, Yii, Yjj, Yld, size ),
00355                                     Yld, YmyprocD, XprocR );
00356                      TYPE->Cgerv2d( ctxt, N, 1, buf, N, YmyprocD, XprocR );
00357                   }
00358                   FDOT( &N, DOT, buf, &ione, Mptr( Y, Yii, Yjj, Yld, size ),
00359                         &Ylinc );
00360                   if( buf ) free( buf );
00361                }
00362             }
00363          }
00364          else
00365          {
00366 /*
00367 *  sub( X ) and sub( Y ) are not both row or column vectors
00368 */
00369             if( ( XmyprocR == XprocR ) && ( YmyprocR == YprocR ) )
00370             {
00371 /*
00372 *  If I am at the intersection of the process row and column, then compute the
00373 *  dot product and broadcast it in my process row and column.
00374 */
00375                FDOT( &N, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y,
00376                      Yii, Yjj, Yld, size ), &Ylinc );
00377                top = PB_Ctop( &ctxt, BCAST, ROW,    TOP_GET );
00378                TYPE->Cgebs2d( ctxt, ROW,    top, 1, 1, DOT, 1 );
00379                top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
00380                TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 );
00381             }
00382             else if( XmyprocR == XprocR )
00383             {
00384                if( XisRow ) { Xscope = CROW; rsrc = XprocR; csrc = YprocR; }
00385                else      { Xscope = CCOLUMN; rsrc = YprocR; csrc = XprocR; }
00386                top = PB_Ctop( &ctxt, BCAST, &Xscope, TOP_GET );
00387                TYPE->Cgebr2d( ctxt, &Xscope, top, 1, 1, DOT, 1, rsrc, csrc );
00388             }
00389             else if( YmyprocR == YprocR )
00390             {
00391                if( YisRow ) { Yscope = CROW; rsrc = YprocR; csrc = XprocR; }
00392                else      { Yscope = CCOLUMN; rsrc = XprocR; csrc = YprocR; }
00393                top = PB_Ctop( &ctxt, BCAST, &Yscope, TOP_GET );
00394                TYPE->Cgebr2d( ctxt, &Yscope, top, 1, 1, DOT, 1, rsrc, csrc );
00395             }
00396          }
00397       }
00398       else
00399       {
00400 /*
00401 *  sub( Y ) is replicated
00402 */
00403          if( XmyprocR == XprocR )
00404          {
00405 /*
00406 *  If I am in the process row (resp. column) owning sub( X ), then compute the
00407 *  dot product and broadcast in my column (resp. row).
00408 */
00409             size = TYPE->size;
00410 
00411             FDOT( &N, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii,
00412                   Yjj, Yld, size ), &Ylinc );
00413 
00414             if( XisRow )
00415             {
00416                top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
00417                TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 );
00418             }
00419             else
00420             {
00421                top = PB_Ctop( &ctxt, BCAST, ROW,    TOP_GET );
00422                TYPE->Cgebs2d( ctxt, ROW,    top, 1, 1, DOT, 1 );
00423             }
00424          }
00425          else
00426          {
00427 /*
00428 *  Otherwise, receive the dot product
00429 */
00430             if( XisRow )
00431             {
00432                top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
00433                TYPE->Cgebr2d( ctxt, COLUMN, top, 1, 1, DOT, 1, XprocR,
00434                               XmyprocD );
00435             }
00436             else
00437             {
00438                top = PB_Ctop( &ctxt, BCAST, ROW,    TOP_GET );
00439                TYPE->Cgebr2d( ctxt, ROW,    top, 1, 1, DOT, 1, XmyprocD,
00440                               XprocR );
00441             }
00442          }
00443       }
00444    }
00445    else
00446    {
00447 /*
00448 *  sub( X ) is replicated
00449 */
00450       if( YisR || ( YmyprocR ==  YprocR ) )
00451       {
00452 /*
00453 *  If I own a piece of sub( Y ), then compute the dot product
00454 */
00455          size = TYPE->size;
00456 
00457          FDOT( &N, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii,
00458                Yjj, Yld, size ), &Ylinc );
00459       }
00460 
00461       if( !YisR )
00462       {
00463 /*
00464 *  If sub( Y ) is not replicated, then broadcast the result to the other
00465 *  processes that own a piece of sub( X ), but were not involved in the above
00466 *  dot-product computation.
00467 */
00468          if( YisRow )
00469          {
00470             top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
00471             if( YmyprocR == YprocR )
00472                TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 );
00473             else
00474                TYPE->Cgebr2d( ctxt, COLUMN, top, 1, 1, DOT, 1, YprocR,
00475                               YmyprocD );
00476          }
00477          else
00478          {
00479             top = PB_Ctop( &ctxt, BCAST, ROW,    TOP_GET );
00480             if( YmyprocR == YprocR )
00481                TYPE->Cgebs2d( ctxt, ROW, top, 1, 1, DOT, 1 );
00482             else
00483                TYPE->Cgebr2d( ctxt, ROW, top, 1, 1, DOT, 1, YmyprocD, YprocR );
00484          }
00485       }
00486    }
00487 /*
00488 *  End of PB_CpdotNN
00489 */
00490 }