From greenfield%dvinci.DEC@decwrl.ARPA Thu Jul 11 14:07:17 1985
Received: from decwrl.ARPA (decwrl.arpa.ARPA) by anl-mcs.ARPA ; Thu, 11 Jul 85 14:06:32 cdt
Received: from DEC-RHEA.ARPA by decwrl.ARPA (4.22.01/4.7.34)
	id AA29594; Thu, 11 Jul 85 10:23:48 pdt
Message-Id: <8507111723.AA29594@decwrl.ARPA>
Date: Thursday, 11 Jul 1985 10:14:06-PDT
From: greenfield%dvinci.DEC@decwrl.ARPA  (Mike Greenfield  mr3-1/e13,dtn231-7481 )
To: dongarra@anl-mcs
Subject: macro32 linpack blas
Status: RO

	Here are the latest versions.  Beware of the warnings (see src code).

	Mike


*****************    sax7.mar *************************************************
*****************    sax7.mar *************************************************
*****************    sax7.mar *************************************************

	.TITLE	SAXPY --  SAX7 LINPACK BLAS optimized for VAX/VMS
	.IDENT  /1.0a/

; This routine replaces the following FORTRAN code from the LINPACK BLAS:
;
;	Subroutine Saxpy(n,da,dx,incx,dy,incy)
;	Real dx(1),dy(1),da
; !	(Note - The distributed BLAS unrolls the following DO loop)
;	Do i = 1,n
;	dy(i) = dy(i) + da*dx(i)
;	Enddo
;	Return
;	End
;
; Author:
;
;	Mike Greenfield
;	Computer Aided Engineering and Manufacturing Group
;	Digital Equipment Corporation
;	2 Iron Way, MR3-1/E13
;	Marlboro, Massachusetts
;	3/85
;
;
; Warnings:
; This routine is slightly different from it's FORTRAN counter-part.  In 
; particular, different behavior will be observed if input arrays overlap with 
; each other or with the CONSTANT argument. Also, the order in which the 
; expressions are evaluated is somewhat modified.  This can cause problems if 
; the evaluation order is important and the arrays have been pre-sorted.
; This routine has only been tested for the case where unit strides are 
; specified (i.e. INCX=INCY=1).
;
;		
;

	.PSECT	$CODE	PIC,CON,REL,LCL,SHR,EXE,RD,NOWRT,QUAD

N=4			; Define stack offsets
Da=8
Dx=12
Incx=16
Dy=20
Incy=24

	.MACRO	Mul_Add Times
	.Repeat	Times
	MULF3	-(R7), R5, R9	; dx(i) * constant
	ADDF2	R9, -(R8)	; Y(i) = Y(i) + above
	.Endr
	.Endm	Mul_Add

	.MACRO	Mul_Add_Pair Times
	.Repeat	Times
;
; Special loop construct to reduce scoreboard stall hit.  This optimization
; was done specifically for the VAX 8600, but it turns out that there is
; no penalty for the other processor models.
;
	MULF3	-(R7), R5, R9	; dx(i) * constant; Stuff result into r9
	MULF3	-(R7), R5, R10	; dx(i) * constant; Stuff result into r10
	ADDF2	R9, -(R8)	; Y(i) = Y(i) + above
	ADDF2	R10, -(R8)	; Y(i) = Y(i) + above
	.Endr
	.Endm	Mul_Add_Pair

	
	.Entry	SAXPY	^M<IV,R5,R6,R7,R8,R9,R10>	; Entry point and
							; register mask

	MOVF	@DA(AP), R5	; Constant into R5
	MOVL	@N(AP),  R6	; Vector Length into R6
	BGTR	SET_UP		; If N>0, then set up 
	RET			; Else exit if length is 0

;
; Check for incx*incy .ne. 1
;
set_up:	cmpl	@incx(ap), #1	; Incx = 1 ?
	bneq	unequal		; Take the un-optimized code path
	cmpl	@incy(ap), #1	; Incy = 1 ?
	bneq	unequal		; Take the un-optimized code path
	jmp	optimized_code_path

unequal:
	jmp	spec_incs	; infrequently traversed path

optimized_code_path:

	mull3	#4, r6, r9	; Compute address range in bytes

	addl3	dx(ap), r9, r7	; Address of 1 element past top dx element
	addl3	dy(ap), r9, r8	; Address of 1 element past top dy element

	blbc	r6, loop1_setup	; if even number, go to loop1_setup now

	Mul_Add 1		; Do Vector operation, now remainder is even

loop1_setup:
	ashl	#-1,r6,r6	; divide count by 2
	bgtr	cont1		; if count is >0, then continue
	ret			; else we are done
cont1:	blbc	r6, loop2_setup	; if even number, go to loop2_setup now

	Mul_Add_Pair 1		; Do vector Operation

loop2_setup:
	ashl	#-1,r6,r6	; divide count by 2
	bgtr	cont2		; if count is >0, then continue
	ret			; else we are done
cont2:	blbc	r6, loop3_setup	; if even number, go to loop3_setup now

	Mul_Add_Pair 2		; Do vector Operation

loop3_setup:
	ashl	#-1,r6,r6	; divide count by 2
	bgtr	cont3		; if count is >0, then continue
	ret			; else we are done
cont3:	blbc	r6, loop4_setup	; if even number, go to loop4_setup now

	Mul_Add_Pair 4		; Do vector Operation

loop4_setup:
	ashl	#-1,r6,r6	; divide count by 2
	bgtr	loopn		; if count is >0, then continue
	ret			; else we are done

; <already aligned>	<NOP>	; Align loop on longword boundary

loopn:	
	Mul_Add_Pair 8		; Do vector operation
	sobgtr	r6, loopn	; loop again until countdown complete

DONE:	RET	


spec_incs:			; ***WARNING****
				; This section of the code has not yet 
				; been tested.  It is not traversed in the
				; LINPACK benchmark.
	pushr	#^M<r0,r11>

	subl3	#4, dx(ap), r7	; move base address of dx vector into r7
	subl3	#4, dy(ap), r6	; move base address of dy vector into r6
	movl	#1, r10		; iy = 1

	tstl	@incx(ap)	; incx=0?
	bgeq	l$4		;
	subl3	@n(ap), #1, r0	; ix = (1-n)
	mull2	@incx(ap), r0	; ix = (1-n) * incx
	addl3	#1, r0, r9	; ix = (1-n) * incx + 1
l$4:
	tstl	@incy(ap)	; incy=0?
	bgeq	l$5		;
	subl3	@n(ap), #1, r0	; iy = (1-n)
	mull2	@incy(ap), r0	; iy = (1-n) * incy
	addl3	#1, r0, r10	; iy = (1-n) * incy + 1
l$5:
	movl	@n(ap), r8	; Length to r8
	movl	#1, r11		; loop index in r11
	cmpl	r11, r8		; index .le. 0?
	bgtr	l$7		; Branch if yes

l$6:
	mulf3	(r7)[r9], r5, r0
	addf2	r0, (r6)[r10]
	addl2	@incx(ap), r9
	addl2	@incy(ap), r10
	aobleq	r8, r11, l$6

	popr	#^M<r0,r11>
l$7:	ret
	.END

*****************    dax7.mar *************************************************
*****************    dax7.mar *************************************************
*****************    dax7.mar *************************************************

	.TITLE	DAXPY --  DAX7 LINPACK BLAS optimized for VAX/VMS
	.IDENT 	/1.0a/
;
; This routine replaces the following FORTRAN code from the LINPACK BLAS:
;
;	Subroutine Daxpy(n,da,dx,incx,dy,incy)
;	Real*8 dx(1),dy(1),da
; !	Note - The distributes BLAS unrolls the following DO loop
;	Do i = 1,n
;	dy(i) = dy(i) + da*dx(i)
;	Enddo
;	Return
;	End
;
; Note: The original DAXPY routine supported strides (incx,incy .ne. 1).
;	This routine was developed for Jack Dongarra's LINPACK benchmark
;	(which only uses unit strides) and only supports the case where
;	incx=incy=1.  If an attempt is made to use it without unit strides,
;	the routine will display an error message and abort the program.
;
; Author:
;
;	Mike Greenfield
;	Computer Aided Engineering and Manufacturing Group
;	Digital Equipment Corporation
;	2 Iron Way, MR3-1/E13
;	Marlboro, Massachusetts
;	3/85
;
;
; Warnings:
; This routine is slightly different from it's FORTRAN counter-part.  In 
; particular, different behavior will be observed if input arrays overlap with 
; each other or with the CONSTANT argument. Also, the order in which the 
; expressions are evaluated is somewhat modified.  This can cause problems if 
; the evaluation order is important and the arrays have been pre-sorted.
; This routine has only been tested for the case where unit strides are 
; specified (i.e. INCX=INCY=1).
;
;		
;
	.PSECT	$data_x	PIC,CON,REL,LCL,NOSHR,NOEXE,RD,NOWRT,QUAD
Msg:	.ascid /MACRO coded BLAS routine DAXPY does not support incx*incy .ne. 1/

	.PSECT	$CODE	PIC,CON,REL,LCL,SHR,EXE,RD,NOWRT,QUAD

n=4			; define stack offsets
da=8
dx=12
incx=16
dy=20
incy=24

	.MACRO	MUL_ADD_PAIR Times
	.Repeat	Times
	MULD3	-(R7), R4, R9	; dx(i) * constant
	MULD3	-(R7), R4, R2	; dx(i) * constant
	ADDD2	R9, -(R8)	; Y(i) = Y(i) + above
	ADDD2	R2, -(R8)	; Y(i) = Y(i) + above
	.Endr
	.Endm	MUL_ADD_PAIR


	.ENTRY	DAXPY	^M<IV,R2,R3,R4,R5,R6,R7,R8,R9,R10>

	MOVD	@DA(AP), R4	; Constant into R4,R5
	MOVL	@N(AP),  R6	; Vector Length into R6
	BGTR	SET_UP		; If N>0, then set up 
	RET			; Else exit if length is 0

;
; Check for incx*incy .ne. 1
;
set_up:	cmpl	@incx(ap), #1	; Incx = 1 ?
	bneq	unequal		; Take the un-optimized code path
	cmpl	@incy(ap), #1	; Incy = 1 ?
	bneq	unequal		; Take the un-optimized code path
	jmp	optimized_code_path

unequal:
	jmp	spec_incs	; infrequently traversed path

optimized_code_path:

	mull3	#8,r6, r9	; Compute address range in bytes
	addl3	dx(ap),r9,r7	; Address of one element past top dx element
	addl3	dy(ap),r9,r8	; Address of one element past top dx element

	blbc	r6, loop1_setup	; if even number, go to loop1_setup now

	MULD3	-(R7), R4, R9	; dx(i) * constant
	ADDD2	R9, -(R8)	; Y(i) = Y(i) + above

loop1_setup:
	ashl	#-1,r6,r6	; divide count by 2
	bgtr	cont1		; if count is >0, then continue
	ret			; else we are done
cont1:	blbc	r6, loop2_setup	; if even number, go to loop2_setup now

	MUL_ADD_PAIR 1		; Do vector Operation

loop2_setup:
	ashl	#-1,r6,r6	; divide count by 2
	bgtr	cont2		; if count is >0, then continue
	ret			; else we are done
cont2:	blbc	r6, loop3_setup	; if even number, go to loop3_setup now

	MUL_ADD_PAIR 2		; Do vector Operation

loop3_setup:
	ashl	#-1,r6,r6	; divide count by 2
	bgtr	cont3		; if count is >0, then continue
	ret			; else we are done
cont3:	blbc	r6, loop4_setup	; if even number, go to loop4_setup now

	MUL_ADD_PAIR 4		; Do vector Operation

loop4_setup:
	ashl	#-1,r6,r6	; divide count by 2
	bgtr	loopn		; if count is >0, then continue
	ret			; else we are done


loopn:	
	MUL_ADD_PAIR 8		; Do vector operation
	sobgtr	r6, loopn	; loop again until countdown complete

DONE:	RET	


spec_incs:			; ***WARNING****
				; This version of DAXPY is NOT implemented to
				; support the case where incx*incy .ne. 1
	pushal 	msg
	Calls	#1, G^Lib$Put_Output
	$Exit_S	#SS$_ABORT	;

	.END


.