#include "redist.h" /** $Id: pigemr.c,v 1.1.1.1 2000/02/15 18:04:08 susan Exp $ ------------------------------------------------------------------------ -- ScaLAPACK routine (version 1.7) -- Oak Ridge National Laboratory, Univ. of Tennessee, and Univ. of California, Berkeley. October 31, 1994. SUBROUTINE PIGEMR2D( M, N, $ A, IA, JA, ADESC, $ B, IB, JB, BDESC, $ CTXT) ------------------------------------------------------------------------ Purpose ======= PIGEMR2D copies a submatrix of A on a submatrix of B. A and B can have different distributions: they can be on different processor grids, they can have different blocksizes, the beginning of the area to be copied can be at a different places on A and B. The parameters can be confusing when the grids of A and B are partially or completly disjoint, in the case a processor calls this routines but is either not in the A context or B context, the ADESC[CTXT] or BDESC[CTXT] must be equal to -1, to ensure the routine recognise this situation. To summarize the rule: - If a processor is in A context, all parameters related to A must be valid. - If a processor is in B context, all parameters related to B must be valid. - ADESC[CTXT] and BDESC[CTXT] must be either valid contexts or equal to -1. - M and N must be valid for everyone. - other parameters are not examined. Notes ===== A description vector is associated with each 2D block-cyclicly dis- tributed matrix. This vector stores the information required to establish the mapping between a matrix entry and its corresponding process and memory location. In the following comments, the character _ should be read as "of the distributed matrix". Let A be a generic term for any 2D block cyclicly distributed matrix. Its description vector is DESC_A: NOTATION STORED IN EXPLANATION --------------- -------------- -------------------------------------- DT_A (global) DESCA( DT_ ) The descriptor type. CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating the BLACS process grid A is distribu- ted over. The context itself is glo- bal, but the handle (the integer value) may vary. M_A (global) DESCA( M_ ) The number of rows in the distributed matrix A. N_A (global) DESCA( N_ ) The number of columns in the distri- buted matrix A. MB_A (global) DESCA( MB_ ) The blocking factor used to distribute the rows of A. NB_A (global) DESCA( NB_ ) The blocking factor used to distribute the columns of A. RSRC_A (global) DESCA( RSRC_ ) The process row over which the first row of the matrix A is distributed. CSRC_A (global) DESCA( CSRC_ ) The process column over which the first column of A is distributed. LLD_A (local) DESCA( LLD_ ) The leading dimension of the local array storing the local blocks of the distributed matrix A. LLD_A >= MAX(1,LOCp(M_A)). Important notice ================ The parameters of the routine have changed in April 1996 There is a new last argument. It must be a context englobing all processors involved in the initial and final distribution. Be aware that all processors included in this context must call the redistribution routine. Parameters ========== M (input) INTEGER. On entry, M specifies the number of rows of the submatrix to be copied. M must be at least zero. Unchanged on exit. N (input) INTEGER. On entry, N specifies the number of cols of the submatrix to be redistributed.rows of B. M must be at least zero. Unchanged on exit. A (input) INTEGER On entry, the source matrix. Unchanged on exit. IA,JA (input) INTEGER On entry,the coordinates of the beginning of the submatrix of A to copy. 1 <= IA <= M_A - M + 1,1 <= JA <= N_A - N + 1, Unchanged on exit. ADESC (input) A description vector (see Notes above) If the current processor is not part of the context of A the ADESC[CTXT] must be equal to -1. B (output) INTEGER On entry, the destination matrix. The portion corresponding to the defined submatrix are updated. IB,JB (input) INTEGER On entry,the coordinates of the beginning of the submatrix of B that will be updated. 1 <= IB <= M_B - M + 1,1 <= JB <= N_B - N + 1, Unchanged on exit. BDESC (input) B description vector (see Notes above) For processors not part of the context of B BDESC[CTXT] must be equal to -1. CTXT (input) a context englobing at least all processors included in either A context or B context Memory requirement : ==================== for the processors belonging to grid 0, one buffer of size block 0 and for the processors belonging to grid 1, also one buffer of size block 1. ============================================================ Created March 1993 by B. Tourancheau (See sccs for modifications). Modifications by Loic PRYLLI 1995 ============================================================ */ #define static2 static #if defined(Add_) || defined(f77IsF2C) #define fortran_mr2d pigemr2do_ #define fortran_mr2dnew pigemr2d_ #elif defined(UpCase) #define fortran_mr2dnew PIGEMR2D #define fortran_mr2d PIGEMR2DO #define icopy_ ICOPY #define ilacpy_ ILACPY #else #define fortran_mr2d pigemr2do #define fortran_mr2dnew pigemr2d #define icopy_ icopy #define ilacpy_ ilacpy #endif #define Clacpy Cigelacpy void Clacpy(); typedef struct { int desctype; int ctxt; int m; int n; int nbrow; int nbcol; int sprow; int spcol; int lda; } MDESC; #define BLOCK_CYCLIC_2D 1 typedef struct { int lstart; int len; } IDESC; #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) #define max(A,B) ((A)>(B)?(A):(B)) #define min(A,B) ((A)>(B)?(B):(A)) #define DIVUP(a,b) ( ((a)-1) /(b)+1) #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) #ifdef MALLOCDEBUG #define malloc mymalloc #define free myfree #define realloc myrealloc #endif /* Cblacs */ extern void Cblacs_pcoord(); extern int Cblacs_pnum(); extern void Csetpvmtids(); extern void Cblacs_get(); extern void Cblacs_pinfo(); extern void Cblacs_gridinfo(); extern void Cblacs_gridinit(); extern void Cblacs_exit(); extern void Cblacs_gridexit(); extern void Cblacs_setup(); extern void Cigebs2d(); extern void Cigebr2d(); extern void Cigesd2d(); extern void Cigerv2d(); extern void Cigsum2d(); extern void Cigamn2d(); extern void Cigamx2d(); extern void Cigesd2d(); extern void Cigerv2d(); /* lapack */ void ilacpy_(); /* aux fonctions */ extern int localindice(); extern void *mr2d_malloc(); extern int ppcm(); extern int localsize(); extern int memoryblocksize(); extern int changeorigin(); extern void paramcheck(); /* tools and others function */ #define scanD0 igescanD0 #define dispmat igedispmat #define setmemory igesetmemory #define freememory igefreememory #define scan_intervals igescan_intervals extern void scanD0(); extern void dispmat(); extern void setmemory(); extern void freememory(); extern int scan_intervals(); extern void Cpigemr2do(); extern void Cpigemr2d(); /* some defines for Cpigemr2do */ #define SENDBUFF 0 #define RECVBUFF 1 #define SIZEBUFF 2 #if 0 #define DEBUG #endif #ifndef DEBUG #define NDEBUG #endif #include #include #include #define DESCLEN 9 void fortran_mr2d(m, n, A, ia, ja, desc_A, B, ib, jb, desc_B) int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; int *A, *B; { Cpigemr2do(*m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B); return; } void fortran_mr2dnew(m, n, A, ia, ja, desc_A, B, ib, jb, desc_B, gcontext) int *ia, *ib, *ja, *jb, *m, *n; int desc_A[DESCLEN], desc_B[DESCLEN]; int *A, *B; int *gcontext; { Cpigemr2d(*m, *n, A, *ia, *ja, (MDESC *) desc_A, B, *ib, *jb, (MDESC *) desc_B, *gcontext); return; } static2 void init_chenille(); static2 int inter_len(); static2 int block2buff(); static2 void buff2block(); static2 void gridreshape(); void Cpigemr2do(m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb) int *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n; { int dummy, nprocs; int gcontext; /* first we initialize a global grid which serve as a reference to * communicate from grid a to grid b */ Cblacs_pinfo(&dummy, &nprocs); Cblacs_get(0, 0, &gcontext); Cblacs_gridinit(&gcontext, "R", 1, nprocs); Cpigemr2d(m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, gcontext); Cblacs_gridexit(gcontext); } #define NBPARAM 20 /* p0,q0,p1,q1, puis ma,na,mba,nba,rowa,cola puis * idem B puis ia,ja puis ib,jb */ #define MAGIC_MAX 100000000 void Cpigemr2d(m, n, ptrmyblock, ia, ja, ma, ptrmynewblock, ib, jb, mb, globcontext) int *ptrmyblock, *ptrmynewblock; /* pointers to the memory location of the matrix and the redistributed matrix */ MDESC *ma; MDESC *mb; int ia, ja, ib, jb, m, n, globcontext; { int *ptrsendbuff, *ptrrecvbuff, *ptrNULL = 0; int *recvptr; MDESC newa, newb; int *proc0, *proc1, *param; int mypnum, myprow0, mypcol0, myprow1, mypcol1, nprocs; int i, j; int nprow, npcol, gcontext; int recvsize, sendsize; IDESC *h_inter; /* to store the horizontal intersections */ IDESC *v_inter; /* to store the vertical intersections */ int hinter_nb, vinter_nb; /* number of intrsections in both directions */ int dummy; int p0, q0, p1, q1; int *ra, *ca; /* end of variables */ /* To simplify further calcul we change the matrix indexation from * 1..m,1..n (fortran) to 0..m-1,0..n-1 */ if (m == 0 || n == 0) return; ia -= 1; ja -= 1; ib -= 1; jb -= 1; Cblacs_gridinfo(globcontext, &nprow, &npcol, &dummy, &mypnum); gcontext = globcontext; nprocs = nprow * npcol; /* if the global context that is given to us has not the shape of a line * (nprow != 1), create a new context. TODO: to be optimal, we should * avoid this because it is an uncessary synchronisation */ if (nprow != 1) { gridreshape(&gcontext); Cblacs_gridinfo(gcontext, &dummy, &dummy, &dummy, &mypnum); } Cblacs_gridinfo(ma->ctxt, &p0, &q0, &myprow0, &mypcol0); /* compatibility T3D, must check myprow and mypcol are within bounds */ if (myprow0 >= p0 || mypcol0 >= q0) myprow0 = mypcol0 = -1; assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); Cblacs_gridinfo(mb->ctxt, &p1, &q1, &myprow1, &mypcol1); if (myprow1 >= p1 || mypcol1 >= q1) myprow1 = mypcol1 = -1; assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); /* exchange the missing parameters among the processors: shape of grids and * location of the processors */ param = (int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(int)); ra = param + nprocs * 2 + NBPARAM; ca = param + (nprocs * 2 + NBPARAM) * 2; for (i = 0; i < nprocs * 2 + NBPARAM; i++) param[i] = MAGIC_MAX; proc0 = param + NBPARAM; proc1 = param + NBPARAM + nprocs; /* we calulate proc0 and proc1 that will give the number of a proc in * respectively a or b in the global context */ if (myprow0 >= 0) { proc0[myprow0 * q0 + mypcol0] = mypnum; param[0] = p0; param[1] = q0; param[4] = ma->m; param[5] = ma->n; param[6] = ma->nbrow; param[7] = ma->nbcol; param[8] = ma->sprow; param[9] = ma->spcol; param[10] = ia; param[11] = ja; } if (myprow1 >= 0) { proc1[myprow1 * q1 + mypcol1] = mypnum; param[2] = p1; param[3] = q1; param[12] = mb->m; param[13] = mb->n; param[14] = mb->nbrow; param[15] = mb->nbcol; param[16] = mb->sprow; param[17] = mb->spcol; param[18] = ib; param[19] = jb; } Cigamn2d(gcontext, "All", "H", 2 * nprocs + NBPARAM, 1, param, 2 * nprocs + NBPARAM, ra, ca, 2 * nprocs + NBPARAM, -1, -1); newa = *ma; newb = *mb; ma = &newa; mb = &newb; if (myprow0 == -1) { p0 = param[0]; q0 = param[1]; ma->m = param[4]; ma->n = param[5]; ma->nbrow = param[6]; ma->nbcol = param[7]; ma->sprow = param[8]; ma->spcol = param[9]; ia = param[10]; ja = param[11]; } if (myprow1 == -1) { p1 = param[2]; q1 = param[3]; mb->m = param[12]; mb->n = param[13]; mb->nbrow = param[14]; mb->nbcol = param[15]; mb->sprow = param[16]; mb->spcol = param[17]; ib = param[18]; jb = param[19]; } for (i = 0; i < NBPARAM; i++) { if (param[i] == MAGIC_MAX) { fprintf(stderr, "xxGEMR2D:something wrong in the parameters\n"); exit(1); } } #ifndef NDEBUG for (i = 0; i < p0 * q0; i++) assert(proc0[i] >= 0 && proc0[i] < nprocs); for (i = 0; i < p1 * q1; i++) assert(proc1[i] >= 0 && proc1[i] < nprocs); #endif /* check the validity of the parameters */ paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); /* we change the problem so that ia < a->nbrow ... andia + m = a->m ... */ { int decal; ia = changeorigin(myprow0, ma->sprow, p0, ma->nbrow, ia, &decal, &ma->sprow); ptrmyblock += decal; ja = changeorigin(mypcol0, ma->spcol, q0, ma->nbcol, ja, &decal, &ma->spcol); ptrmyblock += decal * ma->lda; ma->m = ia + m; ma->n = ja + n; ib = changeorigin(myprow1, mb->sprow, p1, mb->nbrow, ib, &decal, &mb->sprow); ptrmynewblock += decal; jb = changeorigin(mypcol1, mb->spcol, q1, mb->nbcol, jb, &decal, &mb->spcol); ptrmynewblock += decal * mb->lda; mb->m = ib + m; mb->n = jb + n; if (p0 == 1) ma->nbrow = ma->m; if (q0 == 1) ma->nbcol = ma->n; if (p1 == 1) mb->nbrow = mb->m; if (q1 == 1) mb->nbcol = mb->n; #ifndef NDEBUG paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); #endif } /* We compute the size of the memory buffer ( we choose the worst case, * when the buffer sizes == the memory block sizes). */ if (myprow0 >= 0 && mypcol0 >= 0) { /* Initialize pointer variables */ setmemory(&ptrsendbuff, memoryblocksize(ma)); }; /* if (mypnum < p0 * q0) */ if (myprow1 >= 0 && mypcol1 >= 0) { /* Initialize pointer variables */ setmemory(&ptrrecvbuff, memoryblocksize(mb)); }; /* if (mypnum < p1 * q1) */ /* allocing room for the tabs, alloc for the worst case,local_n or local_m * intervals, in fact the worst case should be less, perhaps half that,I * should think of that one day. */ h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * ma->nbcol * sizeof(IDESC)); v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) * ma->nbrow * sizeof(IDESC)); /* We go for the scanning of indices. For each processor including mypnum, * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send * it. Then for each processor, we compute the size of message to be * receive scanD0(SIZEBUFF)), post a receive and then allocate the elements * of recvbuff the right place (scanD)(RECVBUFF)) */ recvptr = ptrrecvbuff; { int tot, myrang, step, sens; int *sender, *recver; int mesending, merecving; tot = max(p0 * q0, p1 * q1); init_chenille(mypnum, nprocs, p0 * q0, proc0, p1 * q1, proc1, &sender, &recver, &myrang); if (myrang == -1) goto after_comm; mesending = myprow0 >= 0; assert(sender[myrang] >= 0 || !mesending); assert(!mesending || proc0[sender[myrang]] == mypnum); merecving = myprow1 >= 0; assert(recver[myrang] >= 0 || !merecving); assert(!merecving || proc1[recver[myrang]] == mypnum); step = tot - 1 - myrang; do { for (sens = 0; sens < 2; sens++) { /* be careful here, when we communicating with ourselves, we must * send first (myrang > step == 0) */ if (mesending && recver[step] >= 0 && (sens == 0)) { i = recver[step] / q1; j = recver[step] % q1; vinter_nb = scan_intervals('r', ia, ib, m, ma, mb, p0, p1, myprow0, i, v_inter); hinter_nb = scan_intervals('c', ja, jb, n, ma, mb, q0, q1, mypcol0, j, h_inter); sendsize = block2buff(v_inter, vinter_nb, h_inter, hinter_nb, ptrmyblock, ma, ptrsendbuff); } /* if (mesending...) { */ if (mesending && recver[step] >= 0 && (sens == myrang > step)) { i = recver[step] / q1; j = recver[step] % q1; if (sendsize > 0 && (step != myrang || !merecving) ) { Cigesd2d(gcontext, sendsize, 1, ptrsendbuff, sendsize, 0, proc1[i * q1 + j]); } /* sendsize > 0 */ } /* if (mesending ... */ if (merecving && sender[step] >= 0 && (sens == myrang <= step)) { i = sender[step] / q0; j = sender[step] % q0; vinter_nb = scan_intervals('r', ib, ia, m, mb, ma, p1, p0, myprow1, i, v_inter); hinter_nb = scan_intervals('c', jb, ja, n, mb, ma, q1, q0, mypcol1, j, h_inter); recvsize = inter_len(hinter_nb, h_inter, vinter_nb, v_inter); if (recvsize > 0) { if (step == myrang && mesending) { Clacpy(recvsize, 1, ptrsendbuff, recvsize, ptrrecvbuff, recvsize); } else { Cigerv2d(gcontext, recvsize, 1, ptrrecvbuff, recvsize, 0, proc0[i * q0 + j]); } } /* recvsize > 0 */ } /* if (merecving ...) */ if (merecving && sender[step] >= 0 && sens == 1) { buff2block(v_inter, vinter_nb, h_inter, hinter_nb, recvptr, ptrmynewblock, mb); } /* if (merecving...) */ } /* for (sens = 0) */ step -= 1; if (step < 0) step = tot - 1; } while (step != tot - 1 - myrang); after_comm: free(sender); } /* { int tot,nr,ns ...} */ /* don't forget to clean up things! */ if (myprow1 >= 0 && mypcol1 >= 0) { freememory((char *) ptrrecvbuff); }; if (myprow0 >= 0 && mypcol0 >= 0) { freememory((char *) ptrsendbuff); }; if (nprow != 1) Cblacs_gridexit(gcontext); free(v_inter); free(h_inter); free(param); }/* distrib */ static2 void init_chenille(mypnum, nprocs, n0, proc0, n1, proc1, psend, precv, myrang) int nprocs, mypnum, n0, n1; int *proc0, *proc1, **psend, **precv, *myrang; { int ns, nr, i, tot; int *sender, *recver, *g0, *g1; tot = max(n0, n1); sender = (int *) mr2d_malloc((nprocs + tot) * sizeof(int) * 2); recver = sender + tot; *psend = sender; *precv = recver; g0 = recver + tot; g1 = g0 + nprocs; for (i = 0; i < nprocs; i++) { g0[i] = -1; g1[i] = -1; } for (i = 0; i < tot; i++) { sender[i] = -1; recver[i] = -1; } for (i = 0; i < n0; i++) g0[proc0[i]] = i; for (i = 0; i < n1; i++) g1[proc1[i]] = i; ns = 0; nr = 0; *myrang = -1; for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] >= 0) { if (i == mypnum) *myrang = nr; sender[ns] = g0[i]; ns += 1; recver[nr] = g1[i]; nr += 1; assert(ns <= n0 && nr <= n1 && nr == ns); } for (i = 0; i < nprocs; i++) if (g0[i] >= 0 && g1[i] < 0) { if (i == mypnum) *myrang = ns; sender[ns] = g0[i]; ns += 1; assert(ns <= n0); } for (i = 0; i < nprocs; i++) if (g1[i] >= 0 && g0[i] < 0) { if (i == mypnum) *myrang = nr; recver[nr] = g1[i]; nr += 1; assert(nr <= n1); } } #define Mlacpy(mo,no,ao,ldao,bo,ldbo) \ { \ int *_a,*_b; \ int _m,_n,_lda,_ldb; \ int _i,_j; \ _m = (mo);_n = (no); \ _a = (ao);_b = (bo); \ _lda = (ldao) - _m; \ _ldb = (ldbo) - _m; \ assert(_lda >= 0 && _ldb >= 0); \ for (_j=0;_j<_n;_j++) { \ for (_i=0;_i<_m;_i++) \ *_b++ = *_a++; \ _b += _ldb; \ _a += _lda; \ } \ } static2 int block2buff(vi, vinb, hi, hinb, ptra, ma, buff) int hinb, vinb; IDESC *hi, *vi; MDESC *ma; int *buff, *ptra; { int h, v, sizebuff; int *ptr2; sizebuff = 0; for (h = 0; h < hinb; h++) { ptr2 = ptra + hi[h].lstart * ma->lda; for (v = 0; v < vinb; v++) { Mlacpy(vi[v].len, hi[h].len, ptr2 + vi[v].lstart, ma->lda, buff + sizebuff, vi[v].len); sizebuff += hi[h].len * vi[v].len; } } return sizebuff; } static2 void buff2block(vi, vinb, hi, hinb, buff, ptrb, mb) int hinb, vinb; IDESC *hi, *vi; MDESC *mb; int *buff, *ptrb; { int h, v, sizebuff; int *ptr2; sizebuff = 0; for (h = 0; h < hinb; h++) { ptr2 = ptrb + hi[h].lstart * mb->lda; for (v = 0; v < vinb; v++) { Mlacpy(vi[v].len, hi[h].len, buff + sizebuff, vi[v].len, ptr2 + vi[v].lstart, mb->lda); sizebuff += hi[h].len * vi[v].len; } } } static2 int inter_len(hinb, hi, vinb, vi) int hinb, vinb; IDESC *hi, *vi; { int hlen, vlen, h, v; hlen = 0; for (h = 0; h < hinb; h++) hlen += hi[h].len; vlen = 0; for (v = 0; v < vinb; v++) vlen += vi[v].len; return hlen * vlen; } void Clacpy(m, n, a, lda, b, ldb) int *a, *b; int m, n, lda, ldb; { int i, j; lda -= m; ldb -= m; assert(lda >= 0 && ldb >= 0); for (j = 0; j < n; j++) { for (i = 0; i < m; i++) *b++ = *a++; b += ldb; a += lda; } } static2 void gridreshape(ctxtp) int *ctxtp; { int ori, final; /* original context, and new context created, with * line form */ int nprow, npcol, myrow, mycol; int *usermap; int i, j; ori = *ctxtp; Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); usermap = mr2d_malloc(sizeof(int) * nprow * npcol); for (i = 0; i < nprow; i++) for (j = 0; j < npcol; j++) { usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); } /* Cblacs_get(0, 0, &final); */ Cblacs_get(ori, 10, &final); Cblacs_gridmap(&final, usermap, 1, 1, nprow * npcol); *ctxtp = final; free(usermap); }