rntsay
Member level 4
fast assembly FIR filter
/*
 * 16-tap pipelined dot product for Pentium, equivalent to (but much faster
 * than) the C routine:
 *
 *      float fir16(float a[16], float b[16])
 *      {
 *              int i;
 *              float sum = 0;
 *              for (i = 0; i < 16; i++)
 *                      sum += a[i] * b[i];
 *              return sum;
 *      }
 *
 * Copyright 1996 Phil Karn, KA9Q
 *
 * Syntax:   AT&T (GNU as), 32-bit x86 with x87 FPU.
 * In:       4(%esp) -> a[], 8(%esp) -> b[]; 16 single-precision floats each
 *           (byte offsets 0..60 are read from both arrays).
 * Out:      %st(0) = a[0]*b[0] + a[1]*b[1] + ... + a[15]*b[15]
 * Clobbers: %eax, %edx; at most three x87 stack slots are in use at once.
 *
 * The products are accumulated strictly in index order:
 * ((p0 + p1) + p2) + ... + p15.  Intermediate values stay on the x87 stack
 * and are never stored back to memory as 32-bit floats.
 *
 * Scheduling: each new product is started before the previous one is added
 * into the running sum; fxch then brings the older product to the top so
 * that the add works on a value whose multiply was issued one tap earlier.
 *
 * Stack pictures in the comments list %st(0) first; pN means a[N]*b[N].
 */
        .globl  _fir16
_fir16:
        /* Only caller-saved scratch registers are used, so there is no
         * prologue/epilogue and the arguments sit directly above the
         * return address. */
        movl    4(%esp), %eax           /* eax -> a[] */
        movl    8(%esp), %edx           /* edx -> b[] */

        /* Prime the pipeline with the first two products. */
        flds    (%eax)
        fmuls   (%edx)                  /* p0 */
        flds    4(%eax)
        fmuls   4(%edx)                 /* p1 p0 */

        /* Taps 2..15 (byte offset = 4 * tap index).  On entry to each
         * expansion the stack holds: pending-product running-sum (for the
         * first expansion: p1 p0). */
        .irp    off, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
        flds    \off(%eax)
        fmuls   \off(%edx)              /* new pending sum       */
        fxch    %st(2)                  /* pending sum new       */
        faddp   %st, %st(1)             /* sum+pending new       */
        .endr

        /* Stack is now: sum0..14 p15 -- fold in the last product. */
        faddp   %st, %st(1)             /* sum left in %st(0) */
        ret