ScaLAPACK  2.0.2
ScaLAPACK: Scalable Linear Algebra PACKage
pztrmr.c
Go to the documentation of this file.
00001 #include "redist.h"
00158 #define static2 static
00159 #if defined(Add_) || defined(f77IsF2C)
00160 #define fortran_mr2d pztrmr2do_
00161 #define fortran_mr2dnew pztrmr2d_
00162 #elif defined(UpCase)
00163 #define fortran_mr2dnew PZTRMR2D
00164 #define fortran_mr2d PZTRMR2DO
00165 #define zcopy_ ZCOPY
00166 #define zlacpy_ ZLACPY
00167 #else
00168 #define fortran_mr2d pztrmr2do
00169 #define fortran_mr2dnew pztrmr2d
00170 #define zcopy_ zcopy
00171 #define zlacpy_ zlacpy
00172 #endif
00173 #define Clacpy Cztrlacpy
00174 void  Clacpy();
00175 typedef struct {
00176   double r, i;
00177 }     dcomplex;
00178 typedef struct {
00179   int   desctype;
00180   int   ctxt;
00181   int   m;
00182   int   n;
00183   int   nbrow;
00184   int   nbcol;
00185   int   sprow;
00186   int   spcol;
00187   int   lda;
00188 }     MDESC;
00189 #define BLOCK_CYCLIC_2D 1
00190 typedef struct {
00191   int   gstart;
00192   int   len;
00193 }     IDESC;
00194 #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow)))
00195 #define max(A,B) ((A)>(B)?(A):(B))
00196 #define min(A,B) ((A)>(B)?(B):(A))
00197 #define DIVUP(a,b) ( ((a)-1) /(b)+1)
00198 #define ROUNDUP(a,b) (DIVUP(a,b)*(b))
00199 #ifdef MALLOCDEBUG
00200 #define malloc mymalloc
00201 #define free myfree
00202 #define realloc myrealloc
00203 #endif
00204 /* Cblacs */
00205 extern void Cblacs_pcoord();
00206 extern int Cblacs_pnum();
00207 extern void Csetpvmtids();
00208 extern void Cblacs_get();
00209 extern void Cblacs_pinfo();
00210 extern void Cblacs_gridinfo();
00211 extern void Cblacs_gridinit();
00212 extern void Cblacs_exit();
00213 extern void Cblacs_gridexit();
00214 extern void Cblacs_setup();
00215 extern void Cigebs2d();
00216 extern void Cigebr2d();
00217 extern void Cigesd2d();
00218 extern void Cigerv2d();
00219 extern void Cigsum2d();
00220 extern void Cigamn2d();
00221 extern void Cigamx2d();
00222 extern void Czgesd2d();
00223 extern void Czgerv2d();
00224 /* lapack */
00225 void  zlacpy_();
00226 /* aux fonctions */
00227 extern int localindice();
00228 extern void *mr2d_malloc();
00229 extern int ppcm();
00230 extern int localsize();
00231 extern int memoryblocksize();
00232 extern int changeorigin();
00233 extern void paramcheck();
00234 /* tools and others function */
00235 #define scanD0 ztrscanD0
00236 #define dispmat ztrdispmat
00237 #define setmemory ztrsetmemory
00238 #define freememory ztrfreememory
00239 #define scan_intervals ztrscan_intervals
00240 extern void scanD0();
00241 extern void dispmat();
00242 extern void setmemory();
00243 extern void freememory();
00244 extern int scan_intervals();
00245 extern void Cpztrmr2do();
00246 extern void Cpztrmr2d();
00247 /* some defines for Cpztrmr2do */
00248 #define SENDBUFF 0
00249 #define RECVBUFF 1
00250 #define SIZEBUFF 2
00251 #if 0
00252 #define DEBUG
00253 #endif
00254 #ifndef DEBUG
00255 #define NDEBUG
00256 #endif
00257 #include <stdio.h>
00258 #include <stdlib.h>
00259 #include <assert.h>
00260 #define DESCLEN 9
00261 void 
00262 fortran_mr2d(uplo, diag, m, n, A, ia, ja, desc_A,
00263              B, ib, jb, desc_B)
00264   char *uplo, *diag;
00265   int  *ia, *ib, *ja, *jb, *m, *n;
00266   int   desc_A[DESCLEN], desc_B[DESCLEN];
00267   dcomplex *A, *B;
00268 {
00269   Cpztrmr2do(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A,
00270              B, *ib, *jb, (MDESC *) desc_B);
00271   return;
00272 }
00273 void 
00274 fortran_mr2dnew(uplo, diag, m, n, A, ia, ja, desc_A,
00275                 B, ib, jb, desc_B, gcontext)
00276   char *uplo, *diag;
00277   int  *ia, *ib, *ja, *jb, *m, *n;
00278   int   desc_A[DESCLEN], desc_B[DESCLEN];
00279   dcomplex *A, *B;
00280   int  *gcontext;
00281 {
00282   Cpztrmr2d(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A,
00283             B, *ib, *jb, (MDESC *) desc_B, *gcontext);
00284   return;
00285 }
00286 static2 void init_chenille();
00287 static2 int inter_len();
00288 static2 int block2buff();
00289 static2 void buff2block();
00290 static2 void gridreshape();
00291 void
00292 Cpztrmr2do(uplo, diag, m, n,
00293            ptrmyblock, ia, ja, ma,
00294            ptrmynewblock, ib, jb, mb)
00295   char *uplo, *diag;
00296   dcomplex *ptrmyblock, *ptrmynewblock;
00297 /* pointers to the memory location of the matrix and the redistributed matrix */
00298   MDESC *ma;
00299   MDESC *mb;
00300   int   ia, ja, ib, jb, m, n;
00301 {
00302   int   dummy, nprocs;
00303   int   gcontext;
00304   /* first we initialize a global grid which serve as a reference to
00305    * communicate from grid a to grid b */
00306   Cblacs_pinfo(&dummy, &nprocs);
00307   Cblacs_get(0, 0, &gcontext);
00308   Cblacs_gridinit(&gcontext, "R", 1, nprocs);
00309   Cpztrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, ma,
00310             ptrmynewblock, ib, jb, mb, gcontext);
00311   Cblacs_gridexit(gcontext);
00312 }
00313 #define NBPARAM 20      /* p0,q0,p1,q1, puis ma,na,mba,nba,rowa,cola puis
00314                          * idem B puis ia,ja puis ib,jb */
00315 #define MAGIC_MAX 100000000
00316 void
00317 Cpztrmr2d(uplo, diag, m, n,
00318           ptrmyblock, ia, ja, ma,
00319           ptrmynewblock, ib, jb, mb, globcontext)
00320   char *uplo, *diag;
00321   dcomplex *ptrmyblock, *ptrmynewblock;
00322 /* pointers to the memory location of the matrix and the redistributed matrix */
00323   MDESC *ma;
00324   MDESC *mb;
00325   int   ia, ja, ib, jb, m, n, globcontext;
00326 {
00327   dcomplex *ptrsendbuff, *ptrrecvbuff, *ptrNULL = 0;
00328   dcomplex *recvptr;
00329   MDESC newa, newb;
00330   int  *proc0, *proc1, *param;
00331   int   mypnum, myprow0, mypcol0, myprow1, mypcol1, nprocs;
00332   int   i, j;
00333   int   nprow, npcol, gcontext;
00334   int   recvsize, sendsize;
00335   IDESC *h_inter;       /* to store the horizontal intersections */
00336   IDESC *v_inter;       /* to store the vertical intersections */
00337   int   hinter_nb, vinter_nb;   /* number of intrsections in both directions */
00338   int   dummy;
00339   int   p0, q0, p1, q1;
00340   int  *ra, *ca;
00341   /* end of variables */
00342   /* To simplify further calcul we change the matrix indexation from
00343    * 1..m,1..n (fortran) to 0..m-1,0..n-1 */
00344   if (m == 0 || n == 0)
00345     return;
00346   ia -= 1;
00347   ja -= 1;
00348   ib -= 1;
00349   jb -= 1;
00350   Cblacs_gridinfo(globcontext, &nprow, &npcol, &dummy, &mypnum);
00351   gcontext = globcontext;
00352   nprocs = nprow * npcol;
00353   /* if the global context that is given to us has not the shape of a line
00354    * (nprow != 1), create a new context.  TODO: to be optimal, we should
00355    * avoid this because it is an uncessary synchronisation */
00356   if (nprow != 1) {
00357     gridreshape(&gcontext);
00358     Cblacs_gridinfo(gcontext, &dummy, &dummy, &dummy, &mypnum);
00359   }
00360   Cblacs_gridinfo(ma->ctxt, &p0, &q0, &myprow0, &mypcol0);
00361   /* compatibility T3D, must check myprow  and mypcol are within bounds */
00362   if (myprow0 >= p0 || mypcol0 >= q0)
00363     myprow0 = mypcol0 = -1;
00364   assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1));
00365   Cblacs_gridinfo(mb->ctxt, &p1, &q1, &myprow1, &mypcol1);
00366   if (myprow1 >= p1 || mypcol1 >= q1)
00367     myprow1 = mypcol1 = -1;
00368   assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1));
00369   /* exchange the missing parameters among the processors: shape of grids and
00370    * location of the processors */
00371   param = (int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(int));
00372   ra = param + nprocs * 2 + NBPARAM;
00373   ca = param + (nprocs * 2 + NBPARAM) * 2;
00374   for (i = 0; i < nprocs * 2 + NBPARAM; i++)
00375     param[i] = MAGIC_MAX;
00376   proc0 = param + NBPARAM;
00377   proc1 = param + NBPARAM + nprocs;
00378   /* we calulate proc0 and proc1 that will give the number of a proc in
00379    * respectively a or b in the global context */
00380   if (myprow0 >= 0) {
00381     proc0[myprow0 * q0 + mypcol0] = mypnum;
00382     param[0] = p0;
00383     param[1] = q0;
00384     param[4] = ma->m;
00385     param[5] = ma->n;
00386     param[6] = ma->nbrow;
00387     param[7] = ma->nbcol;
00388     param[8] = ma->sprow;
00389     param[9] = ma->spcol;
00390     param[10] = ia;
00391     param[11] = ja;
00392   }
00393   if (myprow1 >= 0) {
00394     proc1[myprow1 * q1 + mypcol1] = mypnum;
00395     param[2] = p1;
00396     param[3] = q1;
00397     param[12] = mb->m;
00398     param[13] = mb->n;
00399     param[14] = mb->nbrow;
00400     param[15] = mb->nbcol;
00401     param[16] = mb->sprow;
00402     param[17] = mb->spcol;
00403     param[18] = ib;
00404     param[19] = jb;
00405   }
00406   Cigamn2d(gcontext, "All", "H", 2 * nprocs + NBPARAM, 1, param, 2 * nprocs + NBPARAM,
00407            ra, ca, 2 * nprocs + NBPARAM, -1, -1);
00408   newa = *ma;
00409   newb = *mb;
00410   ma = &newa;
00411   mb = &newb;
00412   if (myprow0 == -1) {
00413     p0 = param[0];
00414     q0 = param[1];
00415     ma->m = param[4];
00416     ma->n = param[5];
00417     ma->nbrow = param[6];
00418     ma->nbcol = param[7];
00419     ma->sprow = param[8];
00420     ma->spcol = param[9];
00421     ia = param[10];
00422     ja = param[11];
00423   }
00424   if (myprow1 == -1) {
00425     p1 = param[2];
00426     q1 = param[3];
00427     mb->m = param[12];
00428     mb->n = param[13];
00429     mb->nbrow = param[14];
00430     mb->nbcol = param[15];
00431     mb->sprow = param[16];
00432     mb->spcol = param[17];
00433     ib = param[18];
00434     jb = param[19];
00435   }
00436   for (i = 0; i < NBPARAM; i++) {
00437     if (param[i] == MAGIC_MAX) {
00438       fprintf(stderr, "xxGEMR2D:something wrong in the parameters\n");
00439       exit(1);
00440     }
00441   }
00442 #ifndef NDEBUG
00443   for (i = 0; i < p0 * q0; i++)
00444     assert(proc0[i] >= 0 && proc0[i] < nprocs);
00445   for (i = 0; i < p1 * q1; i++)
00446     assert(proc1[i] >= 0 && proc1[i] < nprocs);
00447 #endif
00448   /* check the validity of the parameters */
00449   paramcheck(ma, ia, ja, m, n, p0, q0, gcontext);
00450   paramcheck(mb, ib, jb, m, n, p1, q1, gcontext);
00451   /* we change the problem so that ia < a->nbrow ... andia + m = a->m ... */
00452   {
00453     int   decal;
00454     ia = changeorigin(myprow0, ma->sprow, p0,
00455                       ma->nbrow, ia, &decal, &ma->sprow);
00456     ptrmyblock += decal;
00457     ja = changeorigin(mypcol0, ma->spcol, q0,
00458                       ma->nbcol, ja, &decal, &ma->spcol);
00459     ptrmyblock += decal * ma->lda;
00460     ma->m = ia + m;
00461     ma->n = ja + n;
00462     ib = changeorigin(myprow1, mb->sprow, p1,
00463                       mb->nbrow, ib, &decal, &mb->sprow);
00464     ptrmynewblock += decal;
00465     jb = changeorigin(mypcol1, mb->spcol, q1,
00466                       mb->nbcol, jb, &decal, &mb->spcol);
00467     ptrmynewblock += decal * mb->lda;
00468     mb->m = ib + m;
00469     mb->n = jb + n;
00470     if (p0 == 1)
00471       ma->nbrow = ma->m;
00472     if (q0 == 1)
00473       ma->nbcol = ma->n;
00474     if (p1 == 1)
00475       mb->nbrow = mb->m;
00476     if (q1 == 1)
00477       mb->nbcol = mb->n;
00478 #ifndef NDEBUG
00479     paramcheck(ma, ia, ja, m, n, p0, q0, gcontext);
00480     paramcheck(mb, ib, jb, m, n, p1, q1, gcontext);
00481 #endif
00482   }
00483   /* We compute the size of the memory buffer ( we choose the worst case,
00484    * when the buffer sizes == the memory block sizes). */
00485   if (myprow0 >= 0 && mypcol0 >= 0) {
00486     /* Initialize pointer variables */
00487     setmemory(&ptrsendbuff, memoryblocksize(ma));
00488   };    /* if (mypnum < p0 * q0) */
00489   if (myprow1 >= 0 && mypcol1 >= 0) {
00490     /* Initialize pointer variables */
00491     setmemory(&ptrrecvbuff, memoryblocksize(mb));
00492   };    /* if (mypnum < p1 * q1) */
00493   /* allocing room for the tabs, alloc for the worst case,local_n or local_m
00494    * intervals, in fact the worst case should be less, perhaps half that,I
00495    * should think of that one day. */
00496   h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) *
00497                                   ma->nbcol * sizeof(IDESC));
00498   v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow)
00499                                   * ma->nbrow * sizeof(IDESC));
00500   /* We go for the scanning of indices. For each processor including mypnum,
00501    * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send
00502    * it. Then for each processor, we compute the size of message to be
00503    * receive scanD0(SIZEBUFF)), post a receive and then allocate the elements
00504    * of recvbuff the right place (scanD)(RECVBUFF)) */
00505   recvptr = ptrrecvbuff;
00506   {
00507     int   tot, myrang, step, sens;
00508     int  *sender, *recver;
00509     int   mesending, merecving;
00510     tot = max(p0 * q0, p1 * q1);
00511     init_chenille(mypnum, nprocs, p0 * q0, proc0, p1 * q1, proc1,
00512                   &sender, &recver, &myrang);
00513     if (myrang == -1)
00514       goto after_comm;
00515     mesending = myprow0 >= 0;
00516     assert(sender[myrang] >= 0 || !mesending);
00517     assert(!mesending || proc0[sender[myrang]] == mypnum);
00518     merecving = myprow1 >= 0;
00519     assert(recver[myrang] >= 0 || !merecving);
00520     assert(!merecving || proc1[recver[myrang]] == mypnum);
00521     step = tot - 1 - myrang;
00522     do {
00523       for (sens = 0; sens < 2; sens++) {
00524         /* be careful here, when we communicating with ourselves, we must
00525          * send first (myrang > step == 0) */
00526         if (mesending && recver[step] >= 0 &&
00527             (sens == 0)) {
00528           i = recver[step] / q1;
00529           j = recver[step] % q1;
00530           vinter_nb = scan_intervals('r', ia, ib, m, ma, mb, p0, p1, myprow0, i,
00531                                      v_inter);
00532           hinter_nb = scan_intervals('c', ja, jb, n, ma, mb, q0, q1, mypcol0, j,
00533                                      h_inter);
00534           scanD0(uplo, diag, SENDBUFF, ptrsendbuff, &sendsize,
00535                  m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1,
00536                  v_inter, vinter_nb, h_inter, hinter_nb,
00537                  ptrmyblock);
00538         }       /* if (mesending...) { */
00539         if (mesending && recver[step] >= 0 &&
00540             (sens == myrang > step)) {
00541           i = recver[step] / q1;
00542           j = recver[step] % q1;
00543           if (sendsize > 0
00544               && (step != myrang || !merecving)
00545                 ) {
00546             Czgesd2d(gcontext, sendsize, 1, ptrsendbuff, sendsize,
00547                      0, proc1[i * q1 + j]);
00548           }     /* sendsize > 0 */
00549         }       /* if (mesending ... */
00550         if (merecving && sender[step] >= 0 &&
00551             (sens == myrang <= step)) {
00552           i = sender[step] / q0;
00553           j = sender[step] % q0;
00554           vinter_nb = scan_intervals('r', ib, ia, m, mb, ma, p1, p0, myprow1, i,
00555                                      v_inter);
00556           hinter_nb = scan_intervals('c', jb, ja, n, mb, ma, q1, q0, mypcol1, j,
00557                                      h_inter);
00558           scanD0(uplo, diag, SIZEBUFF, ptrNULL, &recvsize,
00559                  m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1,
00560                  v_inter, vinter_nb, h_inter, hinter_nb, ptrNULL);
00561           if (recvsize > 0) {
00562             if (step == myrang && mesending) {
00563               Clacpy(recvsize, 1,
00564                      ptrsendbuff, recvsize,
00565                      ptrrecvbuff, recvsize);
00566             } else {
00567               Czgerv2d(gcontext, recvsize, 1, ptrrecvbuff, recvsize,
00568                        0, proc0[i * q0 + j]);
00569             }
00570           }     /* recvsize > 0 */
00571         }       /* if (merecving ...) */
00572         if (merecving && sender[step] >= 0 && sens == 1) {
00573           scanD0(uplo, diag, RECVBUFF, ptrrecvbuff, &recvsize,
00574                  m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1,
00575                  v_inter, vinter_nb, h_inter, hinter_nb, ptrmynewblock);
00576         }       /* if (merecving...)  */
00577       } /* for (sens = 0) */
00578       step -= 1;
00579       if (step < 0)
00580         step = tot - 1;
00581     } while (step != tot - 1 - myrang);
00582 after_comm:
00583     free(sender);
00584   }     /* { int tot,nr,ns ...} */
00585   /* don't forget to clean up things! */
00586   if (myprow1 >= 0 && mypcol1 >= 0) {
00587     freememory((char *) ptrrecvbuff);
00588   };
00589   if (myprow0 >= 0 && mypcol0 >= 0) {
00590     freememory((char *) ptrsendbuff);
00591   };
00592   if (nprow != 1)
00593     Cblacs_gridexit(gcontext);
00594   free(v_inter);
00595   free(h_inter);
00596   free(param);
00597 }/* distrib */
00598 static2 void 
00599 init_chenille(mypnum, nprocs, n0, proc0, n1, proc1, psend, precv, myrang)
00600   int   nprocs, mypnum, n0, n1;
00601   int  *proc0, *proc1, **psend, **precv, *myrang;
00602 {
00603   int   ns, nr, i, tot;
00604   int  *sender, *recver, *g0, *g1;
00605   tot = max(n0, n1);
00606   sender = (int *) mr2d_malloc((nprocs + tot) * sizeof(int) * 2);
00607   recver = sender + tot;
00608   *psend = sender;
00609   *precv = recver;
00610   g0 = recver + tot;
00611   g1 = g0 + nprocs;
00612   for (i = 0; i < nprocs; i++) {
00613     g0[i] = -1;
00614     g1[i] = -1;
00615   }
00616   for (i = 0; i < tot; i++) {
00617     sender[i] = -1;
00618     recver[i] = -1;
00619   }
00620   for (i = 0; i < n0; i++)
00621     g0[proc0[i]] = i;
00622   for (i = 0; i < n1; i++)
00623     g1[proc1[i]] = i;
00624   ns = 0;
00625   nr = 0;
00626   *myrang = -1;
00627   for (i = 0; i < nprocs; i++)
00628     if (g0[i] >= 0 && g1[i] >= 0) {
00629       if (i == mypnum)
00630         *myrang = nr;
00631       sender[ns] = g0[i];
00632       ns += 1;
00633       recver[nr] = g1[i];
00634       nr += 1;
00635       assert(ns <= n0 && nr <= n1 && nr == ns);
00636     }
00637   for (i = 0; i < nprocs; i++)
00638     if (g0[i] >= 0 && g1[i] < 0) {
00639       if (i == mypnum)
00640         *myrang = ns;
00641       sender[ns] = g0[i];
00642       ns += 1;
00643       assert(ns <= n0);
00644     }
00645   for (i = 0; i < nprocs; i++)
00646     if (g1[i] >= 0 && g0[i] < 0) {
00647       if (i == mypnum)
00648         *myrang = nr;
00649       recver[nr] = g1[i];
00650       nr += 1;
00651       assert(nr <= n1);
00652     }
00653 }
00654 void 
00655 Clacpy(m, n, a, lda, b, ldb)
00656   dcomplex *a, *b;
00657   int   m, n, lda, ldb;
00658 {
00659   int   i, j;
00660   lda -= m;
00661   ldb -= m;
00662   assert(lda >= 0 && ldb >= 0);
00663   for (j = 0; j < n; j++) {
00664     for (i = 0; i < m; i++)
00665       *b++ = *a++;
00666     b += ldb;
00667     a += lda;
00668   }
00669 }
00670 static2 void 
00671 gridreshape(ctxtp)
00672   int  *ctxtp;
00673 {
00674   int   ori, final;     /* original context, and new context created, with
00675                          * line form */
00676   int   nprow, npcol, myrow, mycol;
00677   int  *usermap;
00678   int   i, j;
00679   ori = *ctxtp;
00680   Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol);
00681   usermap = mr2d_malloc(sizeof(int) * nprow * npcol);
00682   for (i = 0; i < nprow; i++)
00683     for (j = 0; j < npcol; j++) {
00684       usermap[i + j * nprow] = Cblacs_pnum(ori, i, j);
00685     }
00686   /* Cblacs_get(0, 0, &final); */
00687   Cblacs_get(ori, 10, &final);
00688   Cblacs_gridmap(&final, usermap, 1, 1, nprow * npcol);
00689   *ctxtp = final;
00690   free(usermap);
00691 }