41{
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282 char GatherDir, ScatterDir, * one, top, tran, * zero;
283 Int Afr, An, Bcol, Bcurcol, Bcurimb1, Bcurinb1, Bcurrow, Bfr, Bii,
284 Bimb, Bimb1, Binb, Binb1, Bjj, Bld, Bmb, Bmp, Bmp0, Bnb, Bnq,
285 Bnq0, Brow, Ccol, Ccurcol, Ccurimb1, Ccurinb1, Ccurrow, Cii,
286 Cimb, Cimb1, Cinb, Cinb1, Cjj, Cld, Cmb, Cmp, Cmp0, Cnb, Cnq,
287 Cnq0, Crow, WABfr, WACfr, WBCfr, WBCsum, conjg, ctxt, fwd, k,
288 kb, kbb, kend, kstart, kstep, ktmp, lside, mycol, myrow,
289 npcol, nprow, size, upper;
292
293
294
297 char * Aptr = NULL, * Bptr = NULL, * Bptr0 = NULL, * Cptr0 = NULL,
298 * WAB = NULL, * WAC = NULL, * WBC = NULL;
299
300
301
302
303
304
305
307
308
309
311
312 An = ( ( lside = (
Mupcase( SIDE[0] ) ==
CLEFT ) ) ? M : N );
315
317 gemm =
TYPE->Fgemm; gsum2d =
TYPE->Cgsum2d;
318
319
320
323 {
324 kstart = 0; kend = ( ( An - 1 ) / kb + 1 ) * kb; kstep = kb;
326 }
327 else
328 {
329 kstart = ( ( An - 1 ) / kb ) * kb; kend = kstep = -kb;
331 }
332
333
334
335 PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj,
336 &Brow, &Bcol );
337 Bimb = DESCB[
IMB_]; Binb = DESCB[
INB_];
338 Bmb = DESCB[
MB_ ]; Bnb = DESCB[
NB_ ]; Bld = DESCB[
LLD_];
340 Bmp0 =
PB_Cnumroc( M, 0, Bimb1, Bmb, myrow, Brow, nprow );
342 Bnq0 =
PB_Cnumroc( N, 0, Binb1, Bnb, mycol, Bcol, npcol );
343 if( ( Bmp0 > 0 ) && ( Bnq0 > 0 ) ) Bptr0 =
Mptr( B, Bii, Bjj, Bld, size );
344
345 PB_Cinfog2l( IC, JC, DESCC, nprow, npcol, myrow, mycol, &Cii, &Cjj,
346 &Crow, &Ccol );
347 Cimb = DESCC[
IMB_]; Cinb = DESCC[
INB_];
348 Cmb = DESCC[
MB_ ]; Cnb = DESCC[
NB_ ]; Cld = DESCC[
LLD_];
350 Cmp0 =
PB_Cnumroc( M, 0, Cimb1, Cmb, myrow, Crow, nprow );
352 Cnq0 =
PB_Cnumroc( N, 0, Cinb1, Cnb, mycol, Ccol, npcol );
353 if( ( Cmp0 > 0 ) && ( Cnq0 > 0 ) ) Cptr0 =
Mptr( C, Cii, Cjj, Cld, size );
354
355 if( lside )
356 {
358
359 if( upper )
360 {
361 for( k = kstart; k != kend; k += kstep )
362 {
363 kbb = An - k; kbb =
MIN( kbb, kb ); ktmp = k + kbb;
364
365
366
368 DESCA,
COLUMN, &Aptr, DBUFA, &Afr );
369
370
371
372
373 PB_Cdescset( Cd0, ktmp, N, Cimb1, Cinb1, Cmb, Cnb, Crow, Ccol,
374 ctxt, Cld );
375 PB_CInV(
TYPE,
NOCONJG,
COLUMN, ktmp, N, Cd0, kbb, Aptr, 0, 0,
376 DBUFA,
COLUMN, &WAC, WACd, &WACfr );
377
378
379
380 if( conjg )
382 zero, WAC, k, 0, WACd );
383 else if( kbb > 1 )
385 zero, WAC, k+1, 0, WACd );
386
387
388
390 ROW, &Bptr, DBUFB, &Bfr );
391
392
393
394 PB_CInV(
TYPE,
NOCONJG,
ROW, ktmp, N, Cd0, kbb, Bptr, 0, 0, DBUFB,
395 ROW, &WBC, WBCd, &WBCfr );
396
397
398
399 Cmp =
PB_Cnumroc( ktmp, 0, Cimb1, Cmb, myrow, Crow, nprow );
400 if( ( Cmp > 0 ) && ( Cnq0 > 0 ) )
402 ALPHA, WAC, &WACd[
LLD_], WBC, &WBCd[
LLD_], one, Cptr0,
403 &Cld );
404 if( WBCfr ) free( WBC );
405 if( Bfr ) free( Bptr );
406
407
408
409
410 PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
411 ctxt, Bld );
412 PB_CInV(
TYPE,
NOCONJG,
COLUMN, ktmp, N, Bd0, kbb, WAC, 0, 0, WACd,
413 COLUMN, &WAB, WABd, &WABfr );
414
415
416
417 PB_Cplapad(
TYPE,
LOWER,
NOCONJG, kbb, kbb, zero, zero, WAB, k, 0,
418 WABd );
419
420
421
422 PB_COutV(
TYPE,
ROW,
INIT, ktmp, N, Bd0, kbb, &WBC, WBCd, &WBCfr,
423 &WBCsum );
424 Bmp =
PB_Cnumroc( ktmp, 0, Bimb1, Bmb, myrow, Brow, nprow );
425 if( ( Bnq0 > 0 ) && ( Bmp > 0 ) )
427 ALPHA, WAB, &WABd[
LLD_], Bptr0, &Bld, zero, WBC,
429 if( WABfr ) free( WAB );
430 if( WACfr ) free( WAC );
431 if( Afr ) free( Aptr );
432
433 if( WBCsum )
434 {
436 Cmb, Crow, Crow, nprow );
437 if( Bnq0 > 0 )
438 gsum2d( ctxt,
COLUMN, &top, kbb, Bnq0, WBC, WBCd[
LLD_],
439 WBCd[
RSRC_], mycol );
440 }
441
442
443
444 PB_CScatterV(
TYPE, &ScatterDir, kbb, N, WBC, 0, 0, WBCd,
ROW, one,
445 C, IC+k, JC, DESCC,
ROW );
446 if( WBCfr ) free( WBC );
447 }
448 }
449 else
450 {
451 for( k = kstart; k != kend; k += kstep )
452 {
453 ktmp = An - k; kbb =
MIN( ktmp, kb );
454
455
456
458 DESCA,
COLUMN, &Aptr, DBUFA, &Afr );
459
460
461
463 Ccurrow =
PB_Cindxg2p( k, Cimb1, Cmb, Crow, Crow, nprow );
464 PB_Cdescset( Cd0, ktmp, N, Ccurimb1, Cinb1, Cmb, Cnb, Ccurrow,
465 Ccol, ctxt, Cld );
466 PB_CInV(
TYPE,
NOCONJG,
COLUMN, ktmp, N, Cd0, kbb, Aptr, 0, 0,
467 DBUFA,
COLUMN, &WAC, WACd, &WACfr );
468
469
470
471 if( conjg )
473 0, 0, WACd );
474 else if( kbb > 1 )
476 0, 1, WACd );
477
478
479
481 ROW, &Bptr, DBUFB, &Bfr );
482
483
484
485 PB_CInV(
TYPE,
NOCONJG,
ROW, ktmp, N, Cd0, kbb, Bptr, 0, 0, DBUFB,
486 ROW, &WBC, WBCd, &WBCfr );
487
488
489
490 Cmp =
PB_Cnumroc( ktmp, k, Cimb1, Cmb, myrow, Crow, nprow );
491 if( ( Cmp > 0 ) && ( Cnq0 > 0 ) )
493 ALPHA, WAC, &WACd[
LLD_], WBC, &WBCd[
LLD_], one,
494 Mptr( Cptr0, Cmp0-Cmp, 0, Cld, size ), &Cld );
495 if( WBCfr ) free( WBC );
496 if( Bfr ) free( Bptr );
497
498
499
500
502 Bcurrow =
PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow );
503 PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow,
504 Bcol, ctxt, Bld );
505 PB_CInV(
TYPE,
NOCONJG,
COLUMN, ktmp, N, Bd0, kbb, WAC, 0, 0, WACd,
506 COLUMN, &WAB, WABd, &WABfr );
507
508
509
510 PB_Cplapad(
TYPE,
UPPER,
NOCONJG, kbb, kbb, zero, zero, WAB, 0, 0,
511 WABd );
512
513
514
515 PB_COutV(
TYPE,
ROW,
INIT, ktmp, N, Bd0, kbb, &WBC, WBCd, &WBCfr,
516 &WBCsum );
517 Bmp =
PB_Cnumroc( ktmp, k, Bimb1, Bmb, myrow, Brow, nprow );
518 if( ( Bnq0 > 0 ) && ( Bmp > 0 ) )
520 ALPHA, WAB, &WABd[
LLD_],
Mptr( Bptr0, Bmp0-Bmp, 0, Bld,
521 size ), &Bld, zero, WBC, &WBCd[
LLD_] );
522 if( WABfr ) free( WAB );
523 if( WACfr ) free( WAC );
524 if( Afr ) free( Aptr );
525
526 if( WBCsum )
527 {
529 Cmb, Crow, Crow, nprow );
530 if( Bnq0 > 0 )
531 gsum2d( ctxt,
COLUMN, &top, kbb, Bnq0, WBC, WBCd[
LLD_],
532 WBCd[
RSRC_], mycol );
533 }
534
535
536
537 PB_CScatterV(
TYPE, &ScatterDir, kbb, N, WBC, 0, 0, WBCd,
ROW, one,
538 C, IC+k, JC, DESCC,
ROW );
539 if( WBCfr ) free( WBC );
540 }
541 }
542 }
543 else
544 {
546
547 if( upper )
548 {
549 for( k = kstart; k != kend; k += kstep )
550 {
551 ktmp = An - k; kbb =
MIN( ktmp, kb );
552
553
554
556 DESCA,
ROW, &Aptr, DBUFA, &Afr );
557
558
559
561 Ccurcol =
PB_Cindxg2p( k, Cinb1, Cnb, Ccol, Ccol, npcol );
562 PB_Cdescset( Cd0, M, ktmp, Cimb1, Ccurinb1, Cmb, Cnb, Crow, Ccurcol,
563 ctxt, Cld );
564 PB_CInV(
TYPE,
NOCONJG,
ROW, M, ktmp, Cd0, kbb, Aptr, 0, 0, DBUFA,
565 ROW, &WAC, WACd, &WACfr );
566
567
568
569 if( conjg )
571 zero, WAC, 0, 0, WACd );
572 else if( kbb > 1 )
574 zero, WAC, 1, 0, WACd );
575
576
577
579 COLUMN, &Bptr, DBUFB, &Bfr );
580
581
582
583 PB_CInV(
TYPE,
NOCONJG,
COLUMN, M, ktmp, Cd0, kbb, Bptr, 0, 0,
584 DBUFB,
COLUMN, &WBC, WBCd, &WBCfr );
585
586
587
588 Cnq =
PB_Cnumroc( ktmp, k, Cinb1, Cnb, mycol, Ccol, npcol );
589 if( ( Cmp0 > 0 ) && ( Cnq > 0 ) )
591 ALPHA, WBC, &WBCd[
LLD_], WAC, &WACd[
LLD_], one,
592 Mptr( Cptr0, 0, Cnq0-Cnq, Cld, size ), &Cld );
593 if( WBCfr ) free( WBC );
594 if( Bfr ) free( Bptr );
595
596
597
598
600 Bcurcol =
PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol );
601 PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow, Bcurcol,
602 ctxt, Bld );
603 PB_CInV(
TYPE,
NOCONJG,
ROW, M, ktmp, Bd0, kbb, WAC, 0, 0, WACd,
604 ROW, &WAB, WABd, &WABfr );
605
606
607
608 PB_Cplapad(
TYPE,
LOWER,
NOCONJG, kbb, kbb, zero, zero, WAB, 0, 0,
609 WABd );
610
611
612
614 &WBCfr, &WBCsum );
615 Bnq =
PB_Cnumroc( ktmp, k, Binb1, Bnb, mycol, Bcol, npcol );
616 if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
618 ALPHA,
Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ), &Bld, WAB,
619 &WABd[
LLD_], zero, WBC, &WBCd[
LLD_] );
620 if( WABfr ) free( WAB );
621 if( WACfr ) free( WAC );
622 if( Afr ) free( Aptr );
623
624 if( WBCsum )
625 {
627 Cnb, Ccol, Ccol, npcol );
628 if( Bmp0 > 0 )
629 gsum2d( ctxt,
ROW, &top, Bmp0, kbb, WBC, WBCd[
LLD_], myrow,
631 }
632
633
634
636 one, C, IC, JC+k, DESCC,
COLUMN );
637 if( WBCfr ) free( WBC );
638 }
639 }
640 else
641 {
642 for( k = kstart; k != kend; k += kstep )
643 {
644 kbb = An - k; kbb =
MIN( kbb, kb ); ktmp = k + kbb;
645
646
647
649 DESCA,
ROW, &Aptr, DBUFA, &Afr );
650
651
652
653
654 PB_Cdescset( Cd0, M, ktmp, Cimb1, Cinb1, Cmb, Cnb, Crow, Ccol, ctxt,
655 Cld );
656 PB_CInV(
TYPE,
NOCONJG,
ROW, M, ktmp, Cd0, kbb, Aptr, 0, 0, DBUFA,
657 ROW, &WAC, WACd, &WACfr );
658
659
660
661 if( conjg )
663 0, k, WACd );
664 else if( kbb > 1 )
666 0, k+1, WACd );
667
668
669
671 COLUMN, &Bptr, DBUFB, &Bfr );
672
673
674
675 PB_CInV(
TYPE,
NOCONJG,
COLUMN, M, ktmp, Cd0, kbb, Bptr, 0, 0,
676 DBUFB,
COLUMN, &WBC, WBCd, &WBCfr );
677
678
679
680 Cnq =
PB_Cnumroc( ktmp, 0, Cinb1, Cnb, mycol, Ccol, npcol );
681 if( ( Cmp0 > 0 ) && ( Cnq > 0 ) )
683 ALPHA, WBC, &WBCd[
LLD_], WAC, &WACd[
LLD_], one, Cptr0,
684 &Cld );
685 if( WBCfr ) free( WBC );
686 if( Bfr ) free( Bptr );
687
688
689
690
691 PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol, ctxt,
692 Bld );
693 PB_CInV(
TYPE,
NOCONJG,
ROW, M, ktmp, Bd0, kbb, WAC, 0, 0, WACd,
694 ROW, &WAB, WABd, &WABfr );
695
696
697
698 PB_Cplapad(
TYPE,
UPPER,
NOCONJG, kbb, kbb, zero, zero, WAB, 0, k,
699 WABd );
700
701
702
703 PB_COutV(
TYPE,
COLUMN,
INIT, M, ktmp, Bd0, kbb, &WBC, WBCd, &WBCfr,
704 &WBCsum );
705 Bnq =
PB_Cnumroc( ktmp, 0, Binb1, Bnb, mycol, Bcol, npcol );
706 if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
708 ALPHA, Bptr0, &Bld, WAB, &WABd[
LLD_], zero, WBC,
710 if( WABfr ) free( WAB );
711 if( WACfr ) free( WAC );
712 if( Afr ) free( Aptr );
713
714 if( WBCsum )
715 {
717 Cnb, Ccol, Ccol, npcol );
718 if( Bmp0 > 0 )
719 gsum2d( ctxt,
ROW, &top, Bmp0, kbb, WBC, WBCd[
LLD_], myrow,
721 }
722
723
724
726 one, C, IC, JC+k, DESCC,
COLUMN );
727 if( WBCfr ) free( WBC );
728 }
729 }
730 }
731
732
733
734}