C     This program times Fortran variants of the Level 1 BLAS
C     operations AXPY and DOT, the Level 2 BLAS routine DGEMV,
C     and the level 3 BLAS routine DGEMM.
C
C     It is very simple-minded.
C
C     It was intended for the IBM RS/6000-530, but may
C     provide useful information on similar systems.
C
C     A DOUBLE PRECISION FUNCTION SECOND is used for timing. You
C     will need to implement this on your system.
C
C     If your cache is not 64K, change IWORDS to be the number
C     of 64bit words which can fit in cache. Also, change
C     NACTUAL so that a matrix of size NACTUAL by NACTUAL
C     comfortably fits in cache.
C
C     Change AVMFLP and AVTIME to be typical values of the megaflop
C     rate you expect, and the number of repetitions of each
C     timing is adjusted so that AVTIME seconds should be
C     accumulated.
C
C     J. Dongarra, P. Mayes, G. Radicati
C     Program generates data for a paper entitled:
C     ``The IBM RS/6000 and Linear Algebra Operations'', to appear.
C
C     How every measurement below is taken:
C       * S0 and S1 are two back-to-back SECOND() readings, so
C         (S1-S0) estimates the cost of one timer call; it is
C         subtracted from the measured interval (S2-S1).
C       * IREP is the repetition count chosen so that, at the
C         assumed rate of AVMFLP megaflops, roughly AVTIME seconds
C         are accumulated.
C       * The "from memory" DGEMV kernels call FLUSH before each
C         repetition; the separately measured cost TFLUSH of one
C         FLUSH call is subtracted IREP times from the interval.
C
C     .. Parameters ..
      INTEGER          IWORDS, IMAX1, IMAX2, MAXN, LDA
      PARAMETER        (IWORDS=8192,IMAX1=IWORDS/8,IMAX2=IWORDS*2,
     +                 MAXN=128,LDA=MAXN+16)
C     .. Local Scalars ..
      DOUBLE PRECISION AVMFLP, AVTIME, OPS, S0, S1, S2, T1, T11, T12,
     +                 T2, T21, T22, T3, T4, T5, T6, T7, T8, TEMP,
     +                 TFLUSH, TIME
      INTEGER          I, IREP, J, K, KOUNT, NACTUAL
C     .. Local Arrays ..
C
      DOUBLE PRECISION A(LDA,MAXN), B(LDA,MAXN), B1(16,IMAX2),
     +                 B2(16,IMAX2), C(LDA,MAXN), X(IMAX2), Y(IMAX2)
C     .. External Functions ..
      DOUBLE PRECISION SECOND
      EXTERNAL         SECOND
C     .. External Subroutines ..
      EXTERNAL         FLUSH, INIT
C     .. Intrinsic Functions ..
      INTRINSIC        MAX, DBLE, NINT
C     .. Executable Statements ..
      AVTIME = 10.0D0
      AVMFLP = 10.0D0
C
C     First compute the time for a call to FLUSH
C     (average over 10000 calls, hence the factor 1.0D-4).
C
      S0 = SECOND()
      S1 = SECOND()
      DO 20 I = 1, 10000
         CALL FLUSH(IWORDS)
   20 CONTINUE
      S2 = SECOND()
      TFLUSH = (S2-S1) - (S1-S0)
      TFLUSH = TFLUSH*1.0D-4
C
C     Initialize the data to values which hopefully won't
C     cause overflow problems.
C
C     B1 and B2 are declared (16,IMAX2), so IMAX2 is the column
C     count handed to INIT.
C
      CALL INIT(A,LDA,MAXN)
      CALL INIT(B,LDA,MAXN)
      CALL INIT(C,LDA,MAXN)
      CALL INIT(B1,16,IMAX2)
      CALL INIT(B2,16,IMAX2)
      CALL INIT(X,IMAX2,1)
      CALL INIT(Y,IMAX2,1)
      NACTUAL = 48
C
C     DOT: Both vectors in cache
C
      OPS = 2*DBLE(IMAX1)
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      TEMP = 0.0D0
      DO 60 KOUNT = 1, IREP
         DO 40 I = 1, IMAX1
            TEMP = TEMP + X(I)*Y(I)
   40    CONTINUE
   60 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DOT: Both vectors in cache'
C
C     AXPY: Both vectors in cache
C     (the multiplier TEMP is zero, so the values in Y do not grow
C     from one repetition to the next)
C
      OPS = 2*DBLE(IMAX1)
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      TEMP = 0.0D0
      DO 100 KOUNT = 1, IREP
         DO 80 I = 1, IMAX1
            Y(I) = Y(I) + TEMP*X(I)
   80    CONTINUE
  100 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'AXPY: Both vectors in cache'
C
C     DOT: both from memory with stride 1
C
      OPS = 2*DBLE(IMAX2)
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 140 KOUNT = 1, IREP
         TEMP = 0.0D0
         DO 120 I = 1, IMAX2
            TEMP = TEMP + X(I)*Y(I)
  120    CONTINUE
  140 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DOT: Both from memory with stride 1'
C
C     DOT: both from memory, x with stride 1, y with stride 16
C     (row 1 of the 16-row array B1 supplies the stride-16 vector)
C
      OPS = 2*DBLE(IMAX2)
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 180 KOUNT = 1, IREP
         TEMP = 0.0D0
         DO 160 I = 1, IMAX2
            TEMP = TEMP + X(I)*B1(1,I)
  160    CONTINUE
  180 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DOT: Both from memory, x with stride 1, y with stride 16'
C
C     DOT: both from memory, both with stride 16
C
      OPS = 2*DBLE(IMAX2)
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 220 KOUNT = 1, IREP
         TEMP = 0.0D0
         DO 200 I = 1, IMAX2
            TEMP = TEMP + B1(1,I)*B2(1,I)
  200    CONTINUE
  220 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DOT: Both from memory, both with stride 16'
C
C     AXPY: Both from memory with stride 1
C     (TEMP is reset to zero here and stays zero for all the
C     remaining AXPY kernels)
C
      OPS = 2*DBLE(IMAX2)
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      TEMP = 0.0D0
      DO 260 KOUNT = 1, IREP
         DO 240 I = 1, IMAX2
            Y(I) = Y(I) + TEMP*X(I)
  240    CONTINUE
  260 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'AXPY: Both from memory with stride 1'
C
C     AXPY: Both from memory, y with stride 1, x with stride 16
C
      OPS = 2*DBLE(IMAX2)
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 300 KOUNT = 1, IREP
         DO 280 I = 1, IMAX2
            Y(I) = Y(I) + TEMP*B1(1,I)
  280    CONTINUE
  300 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'AXPY: Both from memory, y with stride 1, x with stride 16'
C
C     AXPY: Both from memory, y with stride 16, x with stride 1
C
      OPS = 2*DBLE(IMAX2)
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 340 KOUNT = 1, IREP
         DO 320 I = 1, IMAX2
            B1(1,I) = B1(1,I) + TEMP*X(I)
  320    CONTINUE
  340 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'AXPY: Both from memory, y with stride 16, x with stride 1'
C
C     AXPY: Both from memory, both with stride 16
C
      OPS = 2*DBLE(IMAX2)
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 380 KOUNT = 1, IREP
         DO 360 I = 1, IMAX2
            B1(1,I) = B1(1,I) + TEMP*B2(1,I)
  360    CONTINUE
  380 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'AXPY: Both from memory, both with stride 16'
C
C     DGEMV: In cache, A not transposed, DOT
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 440 KOUNT = 1, IREP
         DO 420 I = 1, NACTUAL
            TEMP = Y(I)
            DO 400 J = 1, NACTUAL
               TEMP = TEMP + A(I,J)*X(J)
  400       CONTINUE
            Y(I) = TEMP
  420    CONTINUE
  440 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: In cache, A not transposed, DOT'
C
C     DGEMV: In cache, A not transposed, DOT unrolled to depth 2
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 500 KOUNT = 1, IREP
         DO 480 I = 1, NACTUAL, 2
            T1 = Y(I)
            T2 = Y(I+1)
            DO 460 J = 1, NACTUAL
               T1 = T1 + A(I,J)*X(J)
               T2 = T2 + A(I+1,J)*X(J)
  460       CONTINUE
            Y(I) = T1
            Y(I+1) = T2
  480    CONTINUE
  500 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: In cache, A not transposed, DOT unrolled to depth 2'
C
C     DGEMV: In cache, A not transposed, DOT unrolled to depth 3
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 560 KOUNT = 1, IREP
         DO 540 I = 1, NACTUAL, 3
            T1 = Y(I)
            T2 = Y(I+1)
            T3 = Y(I+2)
            DO 520 J = 1, NACTUAL
               T1 = T1 + A(I,J)*X(J)
               T2 = T2 + A(I+1,J)*X(J)
               T3 = T3 + A(I+2,J)*X(J)
  520       CONTINUE
            Y(I) = T1
            Y(I+1) = T2
            Y(I+2) = T3
  540    CONTINUE
  560 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: In cache, A not transposed, DOT unrolled to depth 3'
C
C     DGEMV: In cache, A not transposed, DOT unrolled to depth 4
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 620 KOUNT = 1, IREP
         DO 600 I = 1, NACTUAL, 4
            T1 = Y(I)
            T2 = Y(I+1)
            T3 = Y(I+2)
            T4 = Y(I+3)
            DO 580 J = 1, NACTUAL
               T1 = T1 + A(I,J)*X(J)
               T2 = T2 + A(I+1,J)*X(J)
               T3 = T3 + A(I+2,J)*X(J)
               T4 = T4 + A(I+3,J)*X(J)
  580       CONTINUE
            Y(I) = T1
            Y(I+1) = T2
            Y(I+2) = T3
            Y(I+3) = T4
  600    CONTINUE
  620 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: In cache, A not transposed, DOT unrolled to depth 4'
C
C     DGEMV: In cache, A not transposed, AXPY
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 680 KOUNT = 1, IREP
         DO 660 J = 1, NACTUAL
            DO 640 I = 1, NACTUAL
               Y(I) = Y(I) + A(I,J)*X(J)
  640       CONTINUE
  660    CONTINUE
  680 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: In cache, A not transposed, AXPY'
C
C     DGEMV: In cache, A not transposed, AXPY unrolled to depth 2
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 740 KOUNT = 1, IREP
         DO 720 J = 1, NACTUAL, 2
            DO 700 I = 1, NACTUAL
               Y(I) = Y(I) + A(I,J)*X(J) + A(I,J+1)*X(J+1)
  700       CONTINUE
  720    CONTINUE
  740 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: In cache, A not transposed, AXPY unrolled to depth 2'
C
C     DGEMV: In cache, A not transposed, AXPY unrolled to depth 3
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 800 KOUNT = 1, IREP
         DO 780 J = 1, NACTUAL, 3
            DO 760 I = 1, NACTUAL
               Y(I) = Y(I) + A(I,J)*X(J) + A(I,J+1)*X(J+1) + A(I,J+2)
     +                *X(J+2)
  760       CONTINUE
  780    CONTINUE
  800 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: In cache, A not transposed, AXPY unrolled to depth 3'
C
C     DGEMV: In cache, A not transposed, AXPY unrolled to depth 4
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 860 KOUNT = 1, IREP
         DO 840 J = 1, NACTUAL, 4
            DO 820 I = 1, NACTUAL
               Y(I) = Y(I) + A(I,J)*X(J) + A(I,J+1)*X(J+1) + A(I,J+2)
     +                *X(J+2) + A(I,J+3)*X(J+3)
  820       CONTINUE
  840    CONTINUE
  860 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: In cache, A not transposed, AXPY unrolled to depth 4'
C
C     DGEMV: In cache, A**T, DOT
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 920 KOUNT = 1, IREP
         DO 900 I = 1, NACTUAL
            TEMP = Y(I)
            DO 880 J = 1, NACTUAL
               TEMP = TEMP + A(J,I)*X(J)
  880       CONTINUE
            Y(I) = TEMP
  900    CONTINUE
  920 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: In cache, A**T, DOT'
C
C
C     DGEMV: In cache, A**T, DOT unrolled to depth 2
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 980 KOUNT = 1, IREP
         DO 960 I = 1, NACTUAL, 2
            T1 = Y(I)
            T2 = Y(I+1)
            DO 940 J = 1, NACTUAL
               T1 = T1 + A(J,I)*X(J)
               T2 = T2 + A(J,I+1)*X(J)
  940       CONTINUE
            Y(I) = T1
            Y(I+1) = T2
  960    CONTINUE
  980 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: In cache, A**T, DOT unrolled to depth 2'
C
C     DGEMV: In cache, A**T, DOT unrolled to depth 3
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 1040 KOUNT = 1, IREP
         DO 1020 I = 1, NACTUAL, 3
            T1 = Y(I)
            T2 = Y(I+1)
            T3 = Y(I+2)
            DO 1000 J = 1, NACTUAL
               T1 = T1 + A(J,I)*X(J)
               T2 = T2 + A(J,I+1)*X(J)
               T3 = T3 + A(J,I+2)*X(J)
 1000       CONTINUE
            Y(I) = T1
            Y(I+1) = T2
            Y(I+2) = T3
 1020    CONTINUE
 1040 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: In cache, A**T, DOT unrolled to depth 3'
C
C     DGEMV: In cache, A**T, DOT unrolled to depth 4
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 1100 KOUNT = 1, IREP
         DO 1080 I = 1, NACTUAL, 4
            T1 = Y(I)
            T2 = Y(I+1)
            T3 = Y(I+2)
            T4 = Y(I+3)
            DO 1060 J = 1, NACTUAL
               T1 = T1 + A(J,I)*X(J)
               T2 = T2 + A(J,I+1)*X(J)
               T3 = T3 + A(J,I+2)*X(J)
               T4 = T4 + A(J,I+3)*X(J)
 1060       CONTINUE
            Y(I) = T1
            Y(I+1) = T2
            Y(I+2) = T3
            Y(I+3) = T4
 1080    CONTINUE
 1100 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: In cache, A**T, DOT unrolled to depth 4'
C
C     DGEMV: In cache, A**T, DOT unrolled to depth 8
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 1160 KOUNT = 1, IREP
         DO 1140 I = 1, NACTUAL, 8
            T1 = Y(I)
            T2 = Y(I+1)
            T3 = Y(I+2)
            T4 = Y(I+3)
            T5 = Y(I+4)
            T6 = Y(I+5)
            T7 = Y(I+6)
            T8 = Y(I+7)
            DO 1120 J = 1, NACTUAL
               T1 = T1 + A(J,I)*X(J)
               T2 = T2 + A(J,I+1)*X(J)
               T3 = T3 + A(J,I+2)*X(J)
               T4 = T4 + A(J,I+3)*X(J)
               T5 = T5 + A(J,I+4)*X(J)
               T6 = T6 + A(J,I+5)*X(J)
               T7 = T7 + A(J,I+6)*X(J)
               T8 = T8 + A(J,I+7)*X(J)
 1120       CONTINUE
            Y(I) = T1
            Y(I+1) = T2
            Y(I+2) = T3
            Y(I+3) = T4
            Y(I+4) = T5
            Y(I+5) = T6
            Y(I+6) = T7
            Y(I+7) = T8
 1140    CONTINUE
 1160 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: In cache, A**T, DOT unrolled to depth 8'
C
C     DGEMV: In cache, A**T, AXPY
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 1220 KOUNT = 1, IREP
         DO 1200 J = 1, NACTUAL
            DO 1180 I = 1, NACTUAL
               Y(I) = Y(I) + A(J,I)*X(J)
 1180       CONTINUE
 1200    CONTINUE
 1220 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: In cache, A**T, AXPY'
C
C     DGEMV: In cache, A**T, AXPY unrolled to depth 2
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 1280 KOUNT = 1, IREP
         DO 1260 J = 1, NACTUAL, 2
            DO 1240 I = 1, NACTUAL
               Y(I) = Y(I) + A(J,I)*X(J) + A(J+1,I)*X(J+1)
 1240       CONTINUE
 1260    CONTINUE
 1280 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: In cache, A**T, AXPY unrolled to depth 2'
C
C     DGEMV: In cache, A**T, AXPY unrolled to depth 3
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 1340 KOUNT = 1, IREP
         DO 1320 J = 1, NACTUAL, 3
            DO 1300 I = 1, NACTUAL
               Y(I) = Y(I) + A(J,I)*X(J) + A(J+1,I)*X(J+1) + A(J+2,I)
     +                *X(J+2)
 1300       CONTINUE
 1320    CONTINUE
 1340 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: In cache, A**T, AXPY unrolled to depth 3'
C
C
C     DGEMV: In cache, A**T, AXPY unrolled to depth 4
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 1400 KOUNT = 1, IREP
         DO 1380 J = 1, NACTUAL, 4
            DO 1360 I = 1, NACTUAL
               Y(I) = Y(I) + A(J,I)*X(J) + A(J+1,I)*X(J+1) + A(J+2,I)
     +                *X(J+2) + A(J+3,I)*X(J+3)
 1360       CONTINUE
 1380    CONTINUE
 1400 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: In cache, A**T, AXPY unrolled to depth 4'
C
C     DGEMV: From memory, A not transposed, DOT
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6+TFLUSH)))
      S0 = SECOND()
      S1 = SECOND()
      DO 1460 KOUNT = 1, IREP
         CALL FLUSH(IWORDS)
         DO 1440 I = 1, NACTUAL
            TEMP = Y(I)
            DO 1420 J = 1, NACTUAL
               TEMP = TEMP + A(I,J)*X(J)
 1420       CONTINUE
            Y(I) = TEMP
 1440    CONTINUE
 1460 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0) - TFLUSH*IREP
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: From memory, A not transposed, DOT'
C
C     DGEMV: From memory, A not transposed, DOT unrolled to depth 2
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6+TFLUSH)))
      S0 = SECOND()
      S1 = SECOND()
      DO 1520 KOUNT = 1, IREP
         CALL FLUSH(IWORDS)
         DO 1500 I = 1, NACTUAL, 2
            T1 = Y(I)
            T2 = Y(I+1)
            DO 1480 J = 1, NACTUAL
               T1 = T1 + A(I,J)*X(J)
               T2 = T2 + A(I+1,J)*X(J)
 1480       CONTINUE
            Y(I) = T1
            Y(I+1) = T2
 1500    CONTINUE
 1520 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0) - TFLUSH*IREP
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: From memory, A not transposed, DOT unrolled to depth 2'
C
C     DGEMV: From memory, A not transposed, DOT unrolled to depth 3
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6+TFLUSH)))
      S0 = SECOND()
      S1 = SECOND()
      DO 1580 KOUNT = 1, IREP
         CALL FLUSH(IWORDS)
         DO 1560 I = 1, NACTUAL, 3
            T1 = Y(I)
            T2 = Y(I+1)
            T3 = Y(I+2)
            DO 1540 J = 1, NACTUAL
               T1 = T1 + A(I,J)*X(J)
               T2 = T2 + A(I+1,J)*X(J)
               T3 = T3 + A(I+2,J)*X(J)
 1540       CONTINUE
            Y(I) = T1
            Y(I+1) = T2
            Y(I+2) = T3
 1560    CONTINUE
 1580 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0) - TFLUSH*IREP
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: From memory, A not transposed, DOT unrolled to depth 3'
C
C     DGEMV: From memory, A not transposed, DOT unrolled to depth 4
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6+TFLUSH)))
      S0 = SECOND()
      S1 = SECOND()
      DO 1640 KOUNT = 1, IREP
         CALL FLUSH(IWORDS)
         DO 1620 I = 1, NACTUAL, 4
            T1 = Y(I)
            T2 = Y(I+1)
            T3 = Y(I+2)
            T4 = Y(I+3)
            DO 1600 J = 1, NACTUAL
               T1 = T1 + A(I,J)*X(J)
               T2 = T2 + A(I+1,J)*X(J)
               T3 = T3 + A(I+2,J)*X(J)
               T4 = T4 + A(I+3,J)*X(J)
 1600       CONTINUE
            Y(I) = T1
            Y(I+1) = T2
            Y(I+2) = T3
            Y(I+3) = T4
 1620    CONTINUE
 1640 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0) - TFLUSH*IREP
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: From memory, A not transposed, DOT unrolled to depth 4'
C
C     DGEMV: From memory, A not transposed, AXPY
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6+TFLUSH)))
      S0 = SECOND()
      S1 = SECOND()
      DO 1700 KOUNT = 1, IREP
         CALL FLUSH(IWORDS)
         DO 1680 J = 1, NACTUAL
            DO 1660 I = 1, NACTUAL
               Y(I) = Y(I) + A(I,J)*X(J)
 1660       CONTINUE
 1680    CONTINUE
 1700 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0) - TFLUSH*IREP
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: From memory, A not transposed, AXPY'
C
C     DGEMV: From memory, A not transposed, AXPY unrolled to depth 2
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6+TFLUSH)))
      S0 = SECOND()
      S1 = SECOND()
      DO 1760 KOUNT = 1, IREP
         CALL FLUSH(IWORDS)
         DO 1740 J = 1, NACTUAL, 2
            DO 1720 I = 1, NACTUAL
               Y(I) = Y(I) + A(I,J)*X(J) + A(I,J+1)*X(J+1)
 1720       CONTINUE
 1740    CONTINUE
 1760 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0) - TFLUSH*IREP
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: From memory, A not transposed, AXPY unrolled to depth 2'
C
C     DGEMV: From memory, A not transposed, AXPY unrolled to depth 3
C     (third term pairs column J+2 of A with X(J+2), matching the
C     in-cache depth-3 kernel)
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6+TFLUSH)))
      S0 = SECOND()
      S1 = SECOND()
      DO 1820 KOUNT = 1, IREP
         CALL FLUSH(IWORDS)
         DO 1800 J = 1, NACTUAL, 3
            DO 1780 I = 1, NACTUAL
               Y(I) = Y(I) + A(I,J)*X(J) + A(I,J+1)*X(J+1) + A(I,J+2)
     +                *X(J+2)
 1780       CONTINUE
 1800    CONTINUE
 1820 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0) - TFLUSH*IREP
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: From memory, A not transposed, AXPY unrolled to depth 3'
C
C     DGEMV: From memory, A not transposed, AXPY unrolled to depth 4
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6+TFLUSH)))
      S0 = SECOND()
      S1 = SECOND()
      DO 1880 KOUNT = 1, IREP
         CALL FLUSH(IWORDS)
         DO 1860 J = 1, NACTUAL, 4
            DO 1840 I = 1, NACTUAL
               Y(I) = Y(I) + A(I,J)*X(J) + A(I,J+1)*X(J+1) + A(I,J+2)
     +                *X(J+2) + A(I,J+3)*X(J+3)
 1840       CONTINUE
 1860    CONTINUE
 1880 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0) - TFLUSH*IREP
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: From memory, A not transposed, AXPY unrolled to depth 4'
C
C     DGEMV: From memory, A**T, DOT
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6+TFLUSH)))
      S0 = SECOND()
      S1 = SECOND()
      DO 1940 KOUNT = 1, IREP
         CALL FLUSH(IWORDS)
         DO 1920 I = 1, NACTUAL
            TEMP = Y(I)
            DO 1900 J = 1, NACTUAL
               TEMP = TEMP + A(J,I)*X(J)
 1900       CONTINUE
            Y(I) = TEMP
 1920    CONTINUE
 1940 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0) - TFLUSH*IREP
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: From memory, A**T, DOT'
C
C     DGEMV: From memory, A**T, DOT unrolled to depth 2
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6+TFLUSH)))
      S0 = SECOND()
      S1 = SECOND()
      DO 2000 KOUNT = 1, IREP
         CALL FLUSH(IWORDS)
         DO 1980 I = 1, NACTUAL, 2
            T1 = Y(I)
            T2 = Y(I+1)
            DO 1960 J = 1, NACTUAL
               T1 = T1 + A(J,I)*X(J)
               T2 = T2 + A(J,I+1)*X(J)
 1960       CONTINUE
            Y(I) = T1
            Y(I+1) = T2
 1980    CONTINUE
 2000 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0) - TFLUSH*IREP
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: From memory, A**T, DOT unrolled to depth 2'
C
C     DGEMV: From memory, A**T, DOT unrolled to depth 3
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6+TFLUSH)))
      S0 = SECOND()
      S1 = SECOND()
      DO 2060 KOUNT = 1, IREP
         CALL FLUSH(IWORDS)
         DO 2040 I = 1, NACTUAL, 3
            T1 = Y(I)
            T2 = Y(I+1)
            T3 = Y(I+2)
            DO 2020 J = 1, NACTUAL
               T1 = T1 + A(J,I)*X(J)
               T2 = T2 + A(J,I+1)*X(J)
               T3 = T3 + A(J,I+2)*X(J)
 2020       CONTINUE
            Y(I) = T1
            Y(I+1) = T2
            Y(I+2) = T3
 2040    CONTINUE
 2060 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0) - TFLUSH*IREP
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: From memory, A**T, DOT unrolled to depth 3'
C
C     DGEMV: From memory, A**T, DOT unrolled to depth 4
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6+TFLUSH)))
      S0 = SECOND()
      S1 = SECOND()
      DO 2120 KOUNT = 1, IREP
         CALL FLUSH(IWORDS)
         DO 2100 I = 1, NACTUAL, 4
            T1 = Y(I)
            T2 = Y(I+1)
            T3 = Y(I+2)
            T4 = Y(I+3)
            DO 2080 J = 1, NACTUAL
               T1 = T1 + A(J,I)*X(J)
               T2 = T2 + A(J,I+1)*X(J)
               T3 = T3 + A(J,I+2)*X(J)
               T4 = T4 + A(J,I+3)*X(J)
 2080       CONTINUE
            Y(I) = T1
            Y(I+1) = T2
            Y(I+2) = T3
            Y(I+3) = T4
 2100    CONTINUE
 2120 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0) - TFLUSH*IREP
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: From memory, A**T, DOT unrolled to depth 4'
C
C     DGEMV: From memory, A**T, AXPY
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6+TFLUSH)))
      S0 = SECOND()
      S1 = SECOND()
      DO 2180 KOUNT = 1, IREP
         CALL FLUSH(IWORDS)
         DO 2160 J = 1, NACTUAL
            DO 2140 I = 1, NACTUAL
               Y(I) = Y(I) + A(J,I)*X(J)
 2140       CONTINUE
 2160    CONTINUE
 2180 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0) - TFLUSH*IREP
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: From memory, A**T, AXPY'
C
C     DGEMV: From memory, A**T, AXPY unrolled to depth 2
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6+TFLUSH)))
      S0 = SECOND()
      S1 = SECOND()
      DO 2240 KOUNT = 1, IREP
         CALL FLUSH(IWORDS)
         DO 2220 J = 1, NACTUAL, 2
            DO 2200 I = 1, NACTUAL
               Y(I) = Y(I) + A(J,I)*X(J) + A(J+1,I)*X(J+1)
 2200       CONTINUE
 2220    CONTINUE
 2240 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0) - TFLUSH*IREP
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: From memory, A**T, AXPY unrolled to depth 2'
C
C     DGEMV: From memory, A**T, AXPY unrolled to depth 3
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6+TFLUSH)))
      S0 = SECOND()
      S1 = SECOND()
      DO 2300 KOUNT = 1, IREP
         CALL FLUSH(IWORDS)
         DO 2280 J = 1, NACTUAL, 3
            DO 2260 I = 1, NACTUAL
               Y(I) = Y(I) + A(J,I)*X(J) + A(J+1,I)*X(J+1) + A(J+2,I)
     +                *X(J+2)
 2260       CONTINUE
 2280    CONTINUE
 2300 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0) - TFLUSH*IREP
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: From memory, A**T, AXPY unrolled to depth 3'
C
C     DGEMV: From memory, A**T, AXPY unrolled to depth 4
C
      OPS = 2*DBLE(NACTUAL)**2
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6+TFLUSH)))
      S0 = SECOND()
      S1 = SECOND()
      DO 2360 KOUNT = 1, IREP
         CALL FLUSH(IWORDS)
         DO 2340 J = 1, NACTUAL, 4
            DO 2320 I = 1, NACTUAL
               Y(I) = Y(I) + A(J,I)*X(J) + A(J+1,I)*X(J+1) + A(J+2,I)
     +                *X(J+2) + A(J+3,I)*X(J+3)
 2320       CONTINUE
 2340    CONTINUE
 2360 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0) - TFLUSH*IREP
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMV: From memory, A**T, AXPY unrolled to depth 4'
C
C     DGEMM: From cache, C <- C + AB, DOT unrolled twice
C     (a NACTUAL by NACTUAL block of C is updated using MAXN
C     columns of A and MAXN rows of B; the expected rate AVMFLP
C     is raised to 40 for the DGEMM kernels)
C
      AVMFLP = 40.0D0
      NACTUAL = 16
      OPS = 2*DBLE(NACTUAL)**2*MAXN
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 2440 KOUNT = 1, IREP
         DO 2420 J = 1, NACTUAL, 2
            DO 2400 I = 1, NACTUAL, 2
               T11 = C(I,J)
               T12 = C(I,J+1)
               T21 = C(I+1,J)
               T22 = C(I+1,J+1)
               DO 2380 K = 1, MAXN
                  T11 = T11 + A(I,K)*B(K,J)
                  T12 = T12 + A(I,K)*B(K,J+1)
                  T21 = T21 + A(I+1,K)*B(K,J)
                  T22 = T22 + A(I+1,K)*B(K,J+1)
 2380          CONTINUE
               C(I,J) = T11
               C(I,J+1) = T12
               C(I+1,J) = T21
               C(I+1,J+1) = T22
 2400       CONTINUE
 2420    CONTINUE
 2440 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMM: From cache, C <- C + AB, DOT unrolled twice'
C
C     DGEMM: From cache, C <- C + A**T*B, DOT unrolled twice
C
      OPS = 2*DBLE(NACTUAL)**2*MAXN
      IREP = MAX(1,NINT(AVTIME/(OPS/AVMFLP/1.0D6)))
      S0 = SECOND()
      S1 = SECOND()
      DO 2520 KOUNT = 1, IREP
         DO 2500 J = 1, NACTUAL, 2
            DO 2480 I = 1, NACTUAL, 2
               T11 = C(I,J)
               T12 = C(I,J+1)
               T21 = C(I+1,J)
               T22 = C(I+1,J+1)
               DO 2460 K = 1, MAXN
                  T11 = T11 + A(K,I)*B(K,J)
                  T12 = T12 + A(K,I)*B(K,J+1)
                  T21 = T21 + A(K,I+1)*B(K,J)
                  T22 = T22 + A(K,I+1)*B(K,J+1)
 2460          CONTINUE
               C(I,J) = T11
               C(I,J+1) = T12
               C(I+1,J) = T21
               C(I+1,J+1) = T22
 2480       CONTINUE
 2500    CONTINUE
 2520 CONTINUE
      S2 = SECOND()
      TIME = (S2-S1) - (S1-S0)
      WRITE (6,FMT=99999) OPS*IREP/TIME/1.0D6,
     +  'DGEMM: From cache, C <- C + A**T*B, DOT unrolled twice'
      STOP
C
C     F10.2 leaves room for rates of 100 Mflops and above; the
C     original F5.2 field printed asterisks for such values.
C
99999 FORMAT (1X,F10.2,' mflops -- ',A)
      END
SUBROUTINE INIT(A,M,N) C C Initializes an array C C .. Scalar Arguments .. INTEGER M, N C .. Array Arguments .. DOUBLE PRECISION A(M,N) C .. Local Scalars .. INTEGER I, J C .. Intrinsic Functions .. INTRINSIC DBLE C .. Executable Statements .. DO 40 J = 1, N DO 20 I = 1, M A(I,J) = 0.01D0*DBLE(J+I) 20 CONTINUE 40 CONTINUE RETURN END SUBROUTINE FLUSH(IWORDS) C C This subroutine causes IWORDS words to be initialized. If it C is called before a code fragment, it will make sure that C the cache is flushed completely. C C .. Local Scalars .. INTEGER I C .. Local Arrays .. DOUBLE PRECISION FOO(8192) C .. Scalar Arguments .. INTEGER IWORDS C .. Executable Statements .. IF (IWORDS.GT.8192) THEN WRITE (6,FMT=*) + '*** Increase size of array in subroutine FLUSH' STOP END IF DO 20 I = 1, IWORDS FOO(I) = 1.1D0 20 CONTINUE RETURN END .