42{
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260 char Aroc, GemmTa, GemmTb, TrA, TrB, * one, * talpha, * tbeta,
261 top, * zero;
262 Int Abufld, AcurrocR, Afr, Afwd, AiD, AiR, AiiD, AiiR, AinbD,
263 AinbR, Ainb1D, Ainb1R, AisR, Akk, Ald, AmyprocD, AmyprocR,
264 AnbD, AnbR, AnpD, AnpR, AnprocsD, AnprocsR, Aoff, ArocD,
265 ArocR, AsrcR, Asrc_, Bcol, Bii, Bimb1, Binb1, Bjj, Bld, Bm,
266 Bmb, Bmp, Bn, Bnb, Bnq, Brow, Cbufld, Ccol, Ccurrow, Cfr,
267 Cfwd, Cii, Cimb, Cimb1, Cinb, Cinb1, CisR, Cjj, Ckk, Cld,
268 Cmb, Cmp, Cnb, Cnq, Coff, Crow, Csrc, WAfr, WCfr, WCsum,
269 ctxt, lcmb, m, maxp, maxpm1, maxq, mb, mbb, mycol, myrow,
270 ncpq, nota, notb, npcol, npq=0, nprow, nrpq, p=0, q=0, size,
271 tmp;
274
275
276
280 char * Abuf = NULL, * Bptr = NULL, * Cbuf = NULL, * WA = NULL,
281 * WC = NULL;
282
283
284
285
287
292
294 gemm =
TYPE->Fgemm; gsum2d =
TYPE->Cgsum2d;
296
297
298
299 if( nota )
300 {
301 AiD = JA; AiR = IA;
303 AinbR = DESCA[
IMB_ ]; AinbD = DESCA[
INB_];
304 AnbR = DESCA[
MB_ ]; AnbD = DESCA[
NB_ ];
305 AsrcR = DESCA[Asrc_]; Ald = DESCA[
LLD_];
306 AmyprocD = mycol; AnprocsD = npcol;
307 AmyprocR = myrow; AnprocsR = nprow;
308 PB_Cinfog2l( IA, JA, DESCA, AnprocsR, AnprocsD, AmyprocR, AmyprocD,
309 &AiiR, &AiiD, &ArocR, &ArocD );
310 }
311 else
312 {
313 AiD = IA; AiR = JA;
315 AinbD = DESCA[
IMB_ ]; AinbR = DESCA[
INB_];
316 AnbD = DESCA[
MB_ ]; AnbR = DESCA[
NB_ ];
317 AsrcR = DESCA[Asrc_]; Ald = DESCA[
LLD_];
318 AmyprocD = myrow; AnprocsD = nprow;
319 AmyprocR = mycol; AnprocsR = npcol;
320 PB_Cinfog2l( IA, JA, DESCA, AnprocsD, AnprocsR, AmyprocD, AmyprocR,
321 &AiiD, &AiiR, &ArocD, &ArocR );
322 }
324 AnpD =
PB_Cnumroc( K, 0, Ainb1D, AnbD, AmyprocD, ArocD, AnprocsD );
326
327 Cimb = DESCC[
IMB_ ]; Cinb = DESCC[
INB_];
328 Cmb = DESCC[
MB_ ]; Cnb = DESCC[
NB_ ];
330 PB_Cinfog2l( IC, JC, DESCC, nprow, npcol, myrow, mycol, &Cii, &Cjj,
331 &Crow, &Ccol );
334 Cnq =
PB_Cnumroc( N, 0, Cinb1, Cnb, mycol, Ccol, npcol );
335
336
337
338
339
340 if( notb )
341 {
342 Bm = K; Bn = N;
344 talpha = ALPHA; GemmTa = ( nota ?
CTRAN : TrA ); GemmTb =
CNOTRAN;
345 }
346 else
347 {
348 Bm = N; Bn = K;
351 {
354 }
355 else
356 {
357 talpha = ALPHA;
359 }
361 }
362
363
364
365 PB_Cdescribe( Bm, Bn, IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj,
366 &Bld, &Bimb1, &Binb1, &Bmb, &Bnb, &Brow, &Bcol, Bd0 );
367
368 Bmp =
PB_Cnumroc( Bm, 0, Bimb1, Bmb, myrow, Brow, nprow );
369 Bnq =
PB_Cnumroc( Bn, 0, Binb1, Bnb, mycol, Bcol, npcol );
370 if( ( Bmp > 0 ) && ( Bnq > 0 ) ) Bptr =
Mptr( B, Bii, Bjj, Bld, size );
371
372
373
374
375 if( !( AisR = ( ( AsrcR < 0 ) || ( AnprocsR == 1 ) ) ) && !Afwd )
376 {
377 tmp =
PB_Cindxg2p( M - 1, Ainb1R, AnbR, ArocR, ArocR, AnprocsR );
378 q =
MModSub( tmp, ArocR, AnprocsR );
379 }
380
381
382
383
384 if( !( CisR = ( ( Crow < 0 ) || ( nprow == 1 ) ) ) && !Cfwd )
385 {
386 tmp =
PB_Cindxg2p( M - 1, Cimb1, Cmb, Crow, Crow, nprow );
387 p =
MModSub( tmp, Crow, nprow );
388 }
389
390
391
392
393 lcmb =
PB_Clcm( ( maxp = ( CisR ? 1 : nprow ) ) * Cmb,
394 ( maxq = ( AisR ? 1 : AnprocsR ) ) * AnbR );
395 m = M;
396 maxpm1 = maxp - 1;
397
398 while( m > 0 )
399 {
400
401
402
403 AcurrocR = ( AisR ? -1 :
MModAdd( ArocR, q, AnprocsR ) );
404 Akk =
PB_Cg2lrem( AiR, AinbR, AnbR, AcurrocR, AsrcR, AnprocsR );
405 AnpR =
PB_Cnumroc( M, 0, Ainb1R, AnbR, AcurrocR, ArocR, AnprocsR );
406
407 Ccurrow = ( CisR ? -1 :
MModAdd( Crow, p, nprow ) );
408 Ckk =
PB_Cg2lrem( IC, Cimb, Cmb, Ccurrow, Csrc, nprow );
409 Cmp =
PB_Cnumroc( M, 0, Cimb1, Cmb, Ccurrow, Crow, nprow );
410
411 PB_CVMinit( &VM, 0, Cmp, AnpR, Cimb1, Ainb1R, Cmb, AnbR, p, q,
412 maxp, maxq, lcmb );
413
414
415
417
418 m -= npq;
419
420
421
422
423 if( npq ) mbb = npq / ( ( npq - 1 ) / mb + 1 );
424
425 while( npq )
426 {
427 mbb =
MIN( mbb, npq );
428
429
430
432
433 if( nota )
434 {
435
436
437
438
439 if( ( Afr = ( ncpq < mbb ) ) != 0 )
440 {
441
442
443
444
445 Abufld = mbb;
446 if( AisR || ( AmyprocR == AcurrocR ) )
447 {
450 AnpD, one,
Mptr( A, Akk, AiiD, Ald, size ), Ald,
451 zero, Abuf, Abufld );
452 }
453 }
454 else
455 {
456
457
458
459 Abufld = Ald;
460 if( AisR || ( AmyprocR == AcurrocR ) )
461 Abuf =
Mptr( A, Akk+Aoff, AiiD, Ald, size );
462 }
463 PB_Cdescset( DBUFA, mbb, K, mbb, Ainb1D, mbb, AnbD, AcurrocR,
464 ArocD, ctxt, Abufld );
465 }
466 else
467 {
468
469
470
471
472 if( ( Afr = ( ncpq < mbb ) ) != 0 )
473 {
474
475
476
477
478 Abufld =
MAX( 1, AnpD );
479 if( AisR || ( AmyprocR == AcurrocR ) )
480 {
483 AnpD, one,
Mptr( A, AiiD, Akk, Ald, size ), Ald,
484 zero, Abuf, Abufld );
485 }
486 }
487 else
488 {
489
490
491
492 Abufld = Ald;
493 if( AisR || ( AmyprocR == AcurrocR ) )
494 Abuf =
Mptr( A, AiiD, Akk+Aoff, Ald, size );
495 }
496 PB_Cdescset( DBUFA, K, mbb, Ainb1D, mbb, AnbD, mbb, ArocD,
497 AcurrocR, ctxt, Abufld );
498 }
499
500 if( notb )
501 {
502
503
504
505 PB_CInV(
TYPE,
NOCONJG,
COLUMN, Bm, Bn, Bd0, mbb, Abuf, 0, 0,
506 DBUFA, &Aroc, &WA, WAd, &WAfr );
507
508
509
510 PB_COutV(
TYPE,
ROW,
INIT, Bm, Bn, Bd0, mbb, &WC, WCd, &WCfr,
511 &WCsum );
512
513
514
515 if( Bmp > 0 && Bnq > 0 )
517 talpha, WA, &WAd[
LLD_], Bptr, &Bld, zero, WC, &WCd[
LLD_] );
518 if( WAfr ) free( WA );
519 if( Afr && ( AisR || ( AmyprocR == AcurrocR ) ) )
520 if( Abuf ) free( Abuf );
521
522
523
524 if( WCsum )
525 {
526 WCd[
RSRC_] = Ccurrow;
527 if( Bnq > 0 )
528 gsum2d( ctxt,
COLUMN, &top, mbb, Bnq, WC, WCd[
LLD_],
530 }
531
532
533
534
535 if( ( Cfr = ( nrpq < mbb ) ) != 0 )
536 {
537
538
539
540 Cbufld = mbb; tbeta = zero;
541 if( CisR || ( myrow == Ccurrow ) )
543 }
544 else
545 {
546
547
548
549 Cbufld = Cld; tbeta = BETA;
550 if( CisR || ( myrow == Ccurrow ) )
551 Cbuf =
Mptr( C, Ckk+Coff, Cjj, Cld, size );
552 }
553 PB_Cdescset( DBUFC, mbb, N, mbb, Cinb1, mbb, Cnb, Ccurrow, Ccol,
554 ctxt, Cbufld );
555
556
557
558 PB_Cpaxpby(
TYPE,
NOCONJG, mbb, N, one, WC, 0, 0, WCd,
ROW, tbeta,
559 Cbuf, 0, 0, DBUFC,
ROW );
560
561
562
563 if( Cfr && ( CisR || ( myrow == Ccurrow ) ) )
564 {
566 BETA,
Mptr( C, Ckk, Cjj, Cld, size ), Cld, one, Cbuf,
567 Cbufld );
568 if( Cbuf ) free( Cbuf );
569 }
570 if( WCfr ) free( WC );
571 }
572 else
573 {
574
575
576
577 PB_CInV(
TYPE,
NOCONJG,
ROW, Bm, Bn, Bd0, mbb, Abuf, 0, 0,
578 DBUFA, &Aroc, &WA, WAd, &WAfr );
579
580
581
582 PB_COutV(
TYPE,
COLUMN,
INIT, Bm, Bn, Bd0, mbb, &WC, WCd, &WCfr,
583 &WCsum );
584
585
586
587 if( Bmp > 0 && Bnq > 0 )
589 talpha, Bptr, &Bld, WA, &WAd[
LLD_], zero, WC, &WCd[
LLD_] );
590 if( WAfr ) free( WA );
591 if( Afr && ( AisR || ( AmyprocR == AcurrocR ) ) )
592 if( Abuf ) free( Abuf );
593
594
595
596 if( WCsum )
597 {
599 if( Bmp > 0 )
600 gsum2d( ctxt,
ROW, &top, Bmp, mbb, WC, WCd[
LLD_], myrow,
602 }
603
604
605
606
607 if( ( Cfr = ( nrpq < mbb ) ) != 0 )
608 {
609
610
611
612 Cbufld = mbb; tbeta = zero;
613 if( CisR || ( myrow == Ccurrow ) )
615 }
616 else
617 {
618
619
620
621 Cbufld = Cld; tbeta = BETA;
622 if( CisR || ( myrow == Ccurrow ) )
623 Cbuf =
Mptr( C, Ckk+Coff, Cjj, Cld, size );
624 }
625 PB_Cdescset( DBUFC, mbb, N, mbb, Cinb1, mbb, Cnb, Ccurrow, Ccol,
626 ctxt, Cbufld );
627
628
629
631 one, WC, 0, 0, WCd,
COLUMN, tbeta, Cbuf, 0, 0, DBUFC,
633
634
635
636 if( Cfr && ( CisR || ( myrow == Ccurrow ) ) )
637 {
639 BETA,
Mptr( C, Ckk, Cjj, Cld, size ), Cld, one, Cbuf,
640 Cbufld );
641 if( Cbuf ) free( Cbuf );
642 }
643 if( WCfr ) free( WC );
644 }
645
646
647
649
650 npq -= mbb;
651 }
652
653
654
655 if( ( Cfwd && ( p == maxpm1 ) ) ||
656 ( !( Cfwd ) && ( p == 0 ) ) )
659 }
660
661 if( TrB ==
CCOTRAN ) free( talpha );
662
663
664
665}