|
ScaLAPACK
2.0.2
ScaLAPACK: Scalable Linear Algebra PACKage
|
00001 #include "redist.h" 00158 #define static2 static 00159 #if defined(Add_) || defined(f77IsF2C) 00160 #define fortran_mr2d pdtrmr2do_ 00161 #define fortran_mr2dnew pdtrmr2d_ 00162 #elif defined(UpCase) 00163 #define fortran_mr2dnew PDTRMR2D 00164 #define fortran_mr2d PDTRMR2DO 00165 #define dcopy_ DCOPY 00166 #define dlacpy_ DLACPY 00167 #else 00168 #define fortran_mr2d pdtrmr2do 00169 #define fortran_mr2dnew pdtrmr2d 00170 #define dcopy_ dcopy 00171 #define dlacpy_ dlacpy 00172 #endif 00173 #define Clacpy Cdtrlacpy 00174 void Clacpy(); 00175 typedef struct { 00176 int desctype; 00177 int ctxt; 00178 int m; 00179 int n; 00180 int nbrow; 00181 int nbcol; 00182 int sprow; 00183 int spcol; 00184 int lda; 00185 } MDESC; 00186 #define BLOCK_CYCLIC_2D 1 00187 typedef struct { 00188 int gstart; 00189 int len; 00190 } IDESC; 00191 #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) 00192 #define max(A,B) ((A)>(B)?(A):(B)) 00193 #define min(A,B) ((A)>(B)?(B):(A)) 00194 #define DIVUP(a,b) ( ((a)-1) /(b)+1) 00195 #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) 00196 #ifdef MALLOCDEBUG 00197 #define malloc mymalloc 00198 #define free myfree 00199 #define realloc myrealloc 00200 #endif 00201 /* Cblacs */ 00202 extern void Cblacs_pcoord(); 00203 extern int Cblacs_pnum(); 00204 extern void Csetpvmtids(); 00205 extern void Cblacs_get(); 00206 extern void Cblacs_pinfo(); 00207 extern void Cblacs_gridinfo(); 00208 extern void Cblacs_gridinit(); 00209 extern void Cblacs_exit(); 00210 extern void Cblacs_gridexit(); 00211 extern void Cblacs_setup(); 00212 extern void Cigebs2d(); 00213 extern void Cigebr2d(); 00214 extern void Cigesd2d(); 00215 extern void Cigerv2d(); 00216 extern void Cigsum2d(); 00217 extern void Cigamn2d(); 00218 extern void Cigamx2d(); 00219 extern void Cdgesd2d(); 00220 extern void Cdgerv2d(); 00221 /* lapack */ 00222 void dlacpy_(); 00223 /* aux fonctions */ 00224 extern int localindice(); 00225 extern void *mr2d_malloc(); 00226 extern int ppcm(); 00227 extern int localsize(); 00228 extern int memoryblocksize(); 00229 extern int changeorigin(); 00230 extern void paramcheck(); 00231 /* tools and others function */ 00232 #define scanD0 dtrscanD0 00233 #define dispmat dtrdispmat 00234 #define setmemory dtrsetmemory 00235 #define freememory dtrfreememory 00236 #define scan_intervals dtrscan_intervals 00237 extern void scanD0(); 00238 extern void dispmat(); 00239 extern void setmemory(); 00240 extern void freememory(); 00241 extern int scan_intervals(); 00242 extern void Cpdtrmr2do(); 00243 extern void Cpdtrmr2d(); 00244 /* some defines for Cpdtrmr2do */ 00245 #define SENDBUFF 0 00246 #define RECVBUFF 1 00247 #define SIZEBUFF 2 00248 #if 0 00249 #define DEBUG 00250 #endif 00251 #ifndef DEBUG 00252 #define NDEBUG 00253 #endif 00254 #include <stdio.h> 00255 #include <stdlib.h> 00256 #include <assert.h> 00257 #define DESCLEN 9 00258 void 00259 fortran_mr2d(uplo, diag, m, n, A, ia, ja, desc_A, 00260 B, ib, jb, desc_B) 00261 char *uplo, *diag; 00262 int *ia, *ib, *ja, *jb, *m, *n; 00263 int desc_A[DESCLEN], desc_B[DESCLEN]; 00264 double *A, *B; 00265 { 00266 Cpdtrmr2do(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A, 00267 B, *ib, *jb, (MDESC *) desc_B); 00268 return; 00269 } 00270 void 00271 fortran_mr2dnew(uplo, diag, m, n, A, ia, ja, desc_A, 00272 B, ib, jb, desc_B, gcontext) 00273 char *uplo, *diag; 00274 int *ia, *ib, *ja, *jb, *m, *n; 00275 int desc_A[DESCLEN], desc_B[DESCLEN]; 00276 double *A, *B; 00277 int *gcontext; 00278 { 00279 Cpdtrmr2d(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A, 00280 B, *ib, *jb, (MDESC *) desc_B, *gcontext); 00281 return; 00282 } 00283 static2 void init_chenille(); 00284 static2 int inter_len(); 00285 static2 int block2buff(); 00286 static2 void buff2block(); 00287 static2 void gridreshape(); 00288 void 00289 Cpdtrmr2do(uplo, diag, m, n, 00290 ptrmyblock, ia, ja, ma, 00291 ptrmynewblock, ib, jb, mb) 00292 char *uplo, *diag; 00293 double *ptrmyblock, *ptrmynewblock; 00294 /* pointers to the memory location of the matrix and the redistributed matrix */ 00295 MDESC *ma; 00296 MDESC *mb; 00297 int ia, ja, ib, jb, m, n; 00298 { 00299 int dummy, nprocs; 00300 int gcontext; 00301 /* first we initialize a global grid which serve as a reference to 00302 * communicate from grid a to grid b */ 00303 Cblacs_pinfo(&dummy, &nprocs); 00304 Cblacs_get(0, 0, &gcontext); 00305 Cblacs_gridinit(&gcontext, "R", 1, nprocs); 00306 Cpdtrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, ma, 00307 ptrmynewblock, ib, jb, mb, gcontext); 00308 Cblacs_gridexit(gcontext); 00309 } 00310 #define NBPARAM 20 /* p0,q0,p1,q1, puis ma,na,mba,nba,rowa,cola puis 00311 * idem B puis ia,ja puis ib,jb */ 00312 #define MAGIC_MAX 100000000 00313 void 00314 Cpdtrmr2d(uplo, diag, m, n, 00315 ptrmyblock, ia, ja, ma, 00316 ptrmynewblock, ib, jb, mb, globcontext) 00317 char *uplo, *diag; 00318 double *ptrmyblock, *ptrmynewblock; 00319 /* pointers to the memory location of the matrix and the redistributed matrix */ 00320 MDESC *ma; 00321 MDESC *mb; 00322 int ia, ja, ib, jb, m, n, globcontext; 00323 { 00324 double *ptrsendbuff, *ptrrecvbuff, *ptrNULL = 0; 00325 double *recvptr; 00326 MDESC newa, newb; 00327 int *proc0, *proc1, *param; 00328 int mypnum, myprow0, mypcol0, myprow1, mypcol1, nprocs; 00329 int i, j; 00330 int nprow, npcol, gcontext; 00331 int recvsize, sendsize; 00332 IDESC *h_inter; /* to store the horizontal intersections */ 00333 IDESC *v_inter; /* to store the vertical intersections */ 00334 int hinter_nb, vinter_nb; /* number of intrsections in both directions */ 00335 int dummy; 00336 int p0, q0, p1, q1; 00337 int *ra, *ca; 00338 /* end of variables */ 00339 /* To simplify further calcul we change the matrix indexation from 00340 * 1..m,1..n (fortran) to 0..m-1,0..n-1 */ 00341 if (m == 0 || n == 0) 00342 return; 00343 ia -= 1; 00344 ja -= 1; 00345 ib -= 1; 00346 jb -= 1; 00347 Cblacs_gridinfo(globcontext, &nprow, &npcol, &dummy, &mypnum); 00348 gcontext = globcontext; 00349 nprocs = nprow * npcol; 00350 /* if the global context that is given to us has not the shape of a line 00351 * (nprow != 1), create a new context. TODO: to be optimal, we should 00352 * avoid this because it is an uncessary synchronisation */ 00353 if (nprow != 1) { 00354 gridreshape(&gcontext); 00355 Cblacs_gridinfo(gcontext, &dummy, &dummy, &dummy, &mypnum); 00356 } 00357 Cblacs_gridinfo(ma->ctxt, &p0, &q0, &myprow0, &mypcol0); 00358 /* compatibility T3D, must check myprow and mypcol are within bounds */ 00359 if (myprow0 >= p0 || mypcol0 >= q0) 00360 myprow0 = mypcol0 = -1; 00361 assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); 00362 Cblacs_gridinfo(mb->ctxt, &p1, &q1, &myprow1, &mypcol1); 00363 if (myprow1 >= p1 || mypcol1 >= q1) 00364 myprow1 = mypcol1 = -1; 00365 assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); 00366 /* exchange the missing parameters among the processors: shape of grids and 00367 * location of the processors */ 00368 param = (int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(int)); 00369 ra = param + nprocs * 2 + NBPARAM; 00370 ca = param + (nprocs * 2 + NBPARAM) * 2; 00371 for (i = 0; i < nprocs * 2 + NBPARAM; i++) 00372 param[i] = MAGIC_MAX; 00373 proc0 = param + NBPARAM; 00374 proc1 = param + NBPARAM + nprocs; 00375 /* we calulate proc0 and proc1 that will give the number of a proc in 00376 * respectively a or b in the global context */ 00377 if (myprow0 >= 0) { 00378 proc0[myprow0 * q0 + mypcol0] = mypnum; 00379 param[0] = p0; 00380 param[1] = q0; 00381 param[4] = ma->m; 00382 param[5] = ma->n; 00383 param[6] = ma->nbrow; 00384 param[7] = ma->nbcol; 00385 param[8] = ma->sprow; 00386 param[9] = ma->spcol; 00387 param[10] = ia; 00388 param[11] = ja; 00389 } 00390 if (myprow1 >= 0) { 00391 proc1[myprow1 * q1 + mypcol1] = mypnum; 00392 param[2] = p1; 00393 param[3] = q1; 00394 param[12] = mb->m; 00395 param[13] = mb->n; 00396 param[14] = mb->nbrow; 00397 param[15] = mb->nbcol; 00398 param[16] = mb->sprow; 00399 param[17] = mb->spcol; 00400 param[18] = ib; 00401 param[19] = jb; 00402 } 00403 Cigamn2d(gcontext, "All", "H", 2 * nprocs + NBPARAM, 1, param, 2 * nprocs + NBPARAM, 00404 ra, ca, 2 * nprocs + NBPARAM, -1, -1); 00405 newa = *ma; 00406 newb = *mb; 00407 ma = &newa; 00408 mb = &newb; 00409 if (myprow0 == -1) { 00410 p0 = param[0]; 00411 q0 = param[1]; 00412 ma->m = param[4]; 00413 ma->n = param[5]; 00414 ma->nbrow = param[6]; 00415 ma->nbcol = param[7]; 00416 ma->sprow = param[8]; 00417 ma->spcol = param[9]; 00418 ia = param[10]; 00419 ja = param[11]; 00420 } 00421 if (myprow1 == -1) { 00422 p1 = param[2]; 00423 q1 = param[3]; 00424 mb->m = param[12]; 00425 mb->n = param[13]; 00426 mb->nbrow = param[14]; 00427 mb->nbcol = param[15]; 00428 mb->sprow = param[16]; 00429 mb->spcol = param[17]; 00430 ib = param[18]; 00431 jb = param[19]; 00432 } 00433 for (i = 0; i < NBPARAM; i++) { 00434 if (param[i] == MAGIC_MAX) { 00435 fprintf(stderr, "xxGEMR2D:something wrong in the parameters\n"); 00436 exit(1); 00437 } 00438 } 00439 #ifndef NDEBUG 00440 for (i = 0; i < p0 * q0; i++) 00441 assert(proc0[i] >= 0 && proc0[i] < nprocs); 00442 for (i = 0; i < p1 * q1; i++) 00443 assert(proc1[i] >= 0 && proc1[i] < nprocs); 00444 #endif 00445 /* check the validity of the parameters */ 00446 paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); 00447 paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); 00448 /* we change the problem so that ia < a->nbrow ... andia + m = a->m ... */ 00449 { 00450 int decal; 00451 ia = changeorigin(myprow0, ma->sprow, p0, 00452 ma->nbrow, ia, &decal, &ma->sprow); 00453 ptrmyblock += decal; 00454 ja = changeorigin(mypcol0, ma->spcol, q0, 00455 ma->nbcol, ja, &decal, &ma->spcol); 00456 ptrmyblock += decal * ma->lda; 00457 ma->m = ia + m; 00458 ma->n = ja + n; 00459 ib = changeorigin(myprow1, mb->sprow, p1, 00460 mb->nbrow, ib, &decal, &mb->sprow); 00461 ptrmynewblock += decal; 00462 jb = changeorigin(mypcol1, mb->spcol, q1, 00463 mb->nbcol, jb, &decal, &mb->spcol); 00464 ptrmynewblock += decal * mb->lda; 00465 mb->m = ib + m; 00466 mb->n = jb + n; 00467 if (p0 == 1) 00468 ma->nbrow = ma->m; 00469 if (q0 == 1) 00470 ma->nbcol = ma->n; 00471 if (p1 == 1) 00472 mb->nbrow = mb->m; 00473 if (q1 == 1) 00474 mb->nbcol = mb->n; 00475 #ifndef NDEBUG 00476 paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); 00477 paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); 00478 #endif 00479 } 00480 /* We compute the size of the memory buffer ( we choose the worst case, 00481 * when the buffer sizes == the memory block sizes). */ 00482 if (myprow0 >= 0 && mypcol0 >= 0) { 00483 /* Initialize pointer variables */ 00484 setmemory(&ptrsendbuff, memoryblocksize(ma)); 00485 }; /* if (mypnum < p0 * q0) */ 00486 if (myprow1 >= 0 && mypcol1 >= 0) { 00487 /* Initialize pointer variables */ 00488 setmemory(&ptrrecvbuff, memoryblocksize(mb)); 00489 }; /* if (mypnum < p1 * q1) */ 00490 /* allocing room for the tabs, alloc for the worst case,local_n or local_m 00491 * intervals, in fact the worst case should be less, perhaps half that,I 00492 * should think of that one day. */ 00493 h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * 00494 ma->nbcol * sizeof(IDESC)); 00495 v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) 00496 * ma->nbrow * sizeof(IDESC)); 00497 /* We go for the scanning of indices. For each processor including mypnum, 00498 * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send 00499 * it. Then for each processor, we compute the size of message to be 00500 * receive scanD0(SIZEBUFF)), post a receive and then allocate the elements 00501 * of recvbuff the right place (scanD)(RECVBUFF)) */ 00502 recvptr = ptrrecvbuff; 00503 { 00504 int tot, myrang, step, sens; 00505 int *sender, *recver; 00506 int mesending, merecving; 00507 tot = max(p0 * q0, p1 * q1); 00508 init_chenille(mypnum, nprocs, p0 * q0, proc0, p1 * q1, proc1, 00509 &sender, &recver, &myrang); 00510 if (myrang == -1) 00511 goto after_comm; 00512 mesending = myprow0 >= 0; 00513 assert(sender[myrang] >= 0 || !mesending); 00514 assert(!mesending || proc0[sender[myrang]] == mypnum); 00515 merecving = myprow1 >= 0; 00516 assert(recver[myrang] >= 0 || !merecving); 00517 assert(!merecving || proc1[recver[myrang]] == mypnum); 00518 step = tot - 1 - myrang; 00519 do { 00520 for (sens = 0; sens < 2; sens++) { 00521 /* be careful here, when we communicating with ourselves, we must 00522 * send first (myrang > step == 0) */ 00523 if (mesending && recver[step] >= 0 && 00524 (sens == 0)) { 00525 i = recver[step] / q1; 00526 j = recver[step] % q1; 00527 vinter_nb = scan_intervals('r', ia, ib, m, ma, mb, p0, p1, myprow0, i, 00528 v_inter); 00529 hinter_nb = scan_intervals('c', ja, jb, n, ma, mb, q0, q1, mypcol0, j, 00530 h_inter); 00531 scanD0(uplo, diag, SENDBUFF, ptrsendbuff, &sendsize, 00532 m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, 00533 v_inter, vinter_nb, h_inter, hinter_nb, 00534 ptrmyblock); 00535 } /* if (mesending...) { */ 00536 if (mesending && recver[step] >= 0 && 00537 (sens == myrang > step)) { 00538 i = recver[step] / q1; 00539 j = recver[step] % q1; 00540 if (sendsize > 0 00541 && (step != myrang || !merecving) 00542 ) { 00543 Cdgesd2d(gcontext, sendsize, 1, ptrsendbuff, sendsize, 00544 0, proc1[i * q1 + j]); 00545 } /* sendsize > 0 */ 00546 } /* if (mesending ... */ 00547 if (merecving && sender[step] >= 0 && 00548 (sens == myrang <= step)) { 00549 i = sender[step] / q0; 00550 j = sender[step] % q0; 00551 vinter_nb = scan_intervals('r', ib, ia, m, mb, ma, p1, p0, myprow1, i, 00552 v_inter); 00553 hinter_nb = scan_intervals('c', jb, ja, n, mb, ma, q1, q0, mypcol1, j, 00554 h_inter); 00555 scanD0(uplo, diag, SIZEBUFF, ptrNULL, &recvsize, 00556 m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, 00557 v_inter, vinter_nb, h_inter, hinter_nb, ptrNULL); 00558 if (recvsize > 0) { 00559 if (step == myrang && mesending) { 00560 Clacpy(recvsize, 1, 00561 ptrsendbuff, recvsize, 00562 ptrrecvbuff, recvsize); 00563 } else { 00564 Cdgerv2d(gcontext, recvsize, 1, ptrrecvbuff, recvsize, 00565 0, proc0[i * q0 + j]); 00566 } 00567 } /* recvsize > 0 */ 00568 } /* if (merecving ...) */ 00569 if (merecving && sender[step] >= 0 && sens == 1) { 00570 scanD0(uplo, diag, RECVBUFF, ptrrecvbuff, &recvsize, 00571 m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, 00572 v_inter, vinter_nb, h_inter, hinter_nb, ptrmynewblock); 00573 } /* if (merecving...) */ 00574 } /* for (sens = 0) */ 00575 step -= 1; 00576 if (step < 0) 00577 step = tot - 1; 00578 } while (step != tot - 1 - myrang); 00579 after_comm: 00580 free(sender); 00581 } /* { int tot,nr,ns ...} */ 00582 /* don't forget to clean up things! */ 00583 if (myprow1 >= 0 && mypcol1 >= 0) { 00584 freememory((char *) ptrrecvbuff); 00585 }; 00586 if (myprow0 >= 0 && mypcol0 >= 0) { 00587 freememory((char *) ptrsendbuff); 00588 }; 00589 if (nprow != 1) 00590 Cblacs_gridexit(gcontext); 00591 free(v_inter); 00592 free(h_inter); 00593 free(param); 00594 }/* distrib */ 00595 static2 void 00596 init_chenille(mypnum, nprocs, n0, proc0, n1, proc1, psend, precv, myrang) 00597 int nprocs, mypnum, n0, n1; 00598 int *proc0, *proc1, **psend, **precv, *myrang; 00599 { 00600 int ns, nr, i, tot; 00601 int *sender, *recver, *g0, *g1; 00602 tot = max(n0, n1); 00603 sender = (int *) mr2d_malloc((nprocs + tot) * sizeof(int) * 2); 00604 recver = sender + tot; 00605 *psend = sender; 00606 *precv = recver; 00607 g0 = recver + tot; 00608 g1 = g0 + nprocs; 00609 for (i = 0; i < nprocs; i++) { 00610 g0[i] = -1; 00611 g1[i] = -1; 00612 } 00613 for (i = 0; i < tot; i++) { 00614 sender[i] = -1; 00615 recver[i] = -1; 00616 } 00617 for (i = 0; i < n0; i++) 00618 g0[proc0[i]] = i; 00619 for (i = 0; i < n1; i++) 00620 g1[proc1[i]] = i; 00621 ns = 0; 00622 nr = 0; 00623 *myrang = -1; 00624 for (i = 0; i < nprocs; i++) 00625 if (g0[i] >= 0 && g1[i] >= 0) { 00626 if (i == mypnum) 00627 *myrang = nr; 00628 sender[ns] = g0[i]; 00629 ns += 1; 00630 recver[nr] = g1[i]; 00631 nr += 1; 00632 assert(ns <= n0 && nr <= n1 && nr == ns); 00633 } 00634 for (i = 0; i < nprocs; i++) 00635 if (g0[i] >= 0 && g1[i] < 0) { 00636 if (i == mypnum) 00637 *myrang = ns; 00638 sender[ns] = g0[i]; 00639 ns += 1; 00640 assert(ns <= n0); 00641 } 00642 for (i = 0; i < nprocs; i++) 00643 if (g1[i] >= 0 && g0[i] < 0) { 00644 if (i == mypnum) 00645 *myrang = nr; 00646 recver[nr] = g1[i]; 00647 nr += 1; 00648 assert(nr <= n1); 00649 } 00650 } 00651 void 00652 Clacpy(m, n, a, lda, b, ldb) 00653 double *a, *b; 00654 int m, n, lda, ldb; 00655 { 00656 int i, j; 00657 lda -= m; 00658 ldb -= m; 00659 assert(lda >= 0 && ldb >= 0); 00660 for (j = 0; j < n; j++) { 00661 for (i = 0; i < m; i++) 00662 *b++ = *a++; 00663 b += ldb; 00664 a += lda; 00665 } 00666 } 00667 static2 void 00668 gridreshape(ctxtp) 00669 int *ctxtp; 00670 { 00671 int ori, final; /* original context, and new context created, with 00672 * line form */ 00673 int nprow, npcol, myrow, mycol; 00674 int *usermap; 00675 int i, j; 00676 ori = *ctxtp; 00677 Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); 00678 usermap = mr2d_malloc(sizeof(int) * nprow * npcol); 00679 for (i = 0; i < nprow; i++) 00680 for (j = 0; j < npcol; j++) { 00681 usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); 00682 } 00683 /* Cblacs_get(0, 0, &final); */ 00684 Cblacs_get(ori, 10, &final); 00685 Cblacs_gridmap(&final, usermap, 1, 1, nprow * npcol); 00686 *ctxtp = final; 00687 free(usermap); 00688 }