#include #include #include #include "mpl_blas.h" #define PRECISION float /*************************************************************************** * * * DATA PARALLEL BLAS based on MPL * * * * Internal routine, this routine is not supposed to be * * called by user programs. * * * * Version 1.0 1/4-92 , * * For MasPar MP-1 computers * * * * para//ab, University of Bergen, NORWAY * * * * The calling sequence may be changed in a future version. * * Please report any BUGs, ideas for improvement or other * * comments to * * adm@parallab.uib.no * * * * Future versions may then reflect your suggestions. * * The most current version of this software is available * * from netlib@nac.no , send the message `send index from maspar' * * * * REVISIONS: * * * ***************************************************************************/ /* This routine do the computational part of the rank1 update. Logic and "quick return" have taken place in the f90 part. Moving x and y into local variables too */ #ifdef __STDMPL__ visible void mpl_sger(int m, int n, PRECISION alpha, plural PRECISION *x_loc, plural PRECISION *y_loc, plural PRECISION *a) #else visible void mpl_sger(m,n,alpha,x_loc,y_loc,a) int m, n; PRECISION alpha; plural PRECISION *x_loc,*y_loc,*a; #endif { register plural PRECISION xtmp,ytmp,yglob; plural PRECISION *xglob; register plural int ip = iproc,ix = ixproc, iy = iyproc; register int i,j,ind,mbloc,nbloc; register int nx=nxproc,ny=nyproc; void free(); if (m>=nproc || n>=nproc){ printf("Can not handle matrices of this size \n "); printf("Computation continues without updating A \n"); } else{ mbloc = NBX(m); /* # non-empty blocks of x */ nbloc = NBY(n); /* # non-empty blocks of trans(y) */ /* In order to save on communication we have to store the blocks of the x-vector. This introduce extra use storage */ xglob =(plural PRECISION *) p_malloc(mbloc*sizeof(PRECISION)); /* Putting vectors into registers and zeroing on non-participating processors */ ytmp = *y_loc; xtmp = *x_loc; if (ip >= m) xtmp = 0.0; if (ip >= n) ytmp = 0.0; /* transpose y */ router[(ip>>lnyproc) + (ip%ny)*nx].ytmp = ytmp; /* spreading out x-values at the DPU */ xtmp *= alpha; for (i=0; i