

#include "fft_int20.h"
/* Debug-output switches.
 * Every name below carries an _OFF suffix, so the un-suffixed macro of the
 * same name stays undefined and the matching #ifdef sections are not
 * compiled. Remove the suffix from a line to enable that output.
 * NOTE(review): only FFT_PRINT and FFT_PRINT_STEPS are tested by the code in
 * this file; the TWIDDLE and REVERSAL switches look like leftovers - confirm.
 */
#define FFT_PRINT_OFF
#define FFT_PRINT_STEPS_OFF
#define FFT_PRINT_TWIDDLE_OFF
#define FFT_PRINT_REVERSAL_OFF

/* 1/sqrt(2) in the fixed-point scale used by the multiplies in this file,
 * where products are divided by (1<<19):
 * 1/sqrt(2)*(1<<19) = 370727.6, rounded to the nearest integer.
 */
#define onedivsqrt2_int20 370728

/* Halving add of two two-element values: dest[n] = (a[n] + b[n] + 1) / 2 for
 * n = 0 and n = 1 (presumably the real and the imaginary part). The +1 is
 * added before the truncating integer division.
 */
#define _complex_addshift_int20(a,b,dest) { \
  (dest)[0]=((a)[0]+(b)[0]+1)/2; \
  (dest)[1]=((a)[1]+(b)[1]+1)/2; \
}

/* Plain add of two two-element values: dest[n] = a[n] + b[n] for n = 0 and
 * n = 1 (presumably the real and the imaginary part). No scaling is applied.
 */
#define _complex_add_int20(a,b,dest) { \
  (dest)[0]=(a)[0]+(b)[0]; \
  (dest)[1]=(a)[1]+(b)[1]; \
}

/* Fixed-point complex multiply:
 *   desta + j*destb = (ar + j*ai) * (br + j*bi) / 2^19
 * Both results are truncated toward zero when stored.
 *
 * ar,ai,br,bi  operands (any integer expressions), each evaluated once
 *              before either destination is written, so a destination may
 *              name the same object as an operand
 * desta,destb  lvalues receiving the real and the imaginary part
 *
 * The intermediate products are held in double: a product of two 20-bit
 * values needs up to 39 bits, which a single-precision float (24-bit
 * mantissa) cannot represent exactly, whereas double holds it without
 * rounding. The macro is self-contained and does not need any scratch
 * variable in the caller's scope.
 */
#define _complex_mul_int20(ar,ai,br,bi,desta,destb) { \
      const double cmul_ar_=(ar); \
      const double cmul_ai_=(ai); \
      const double cmul_br_=(br); \
      const double cmul_bi_=(bi); \
      (desta)=(cmul_ar_*cmul_br_-cmul_ai_*cmul_bi_)/(1<<19); \
      (destb)=(cmul_ar_*cmul_bi_+cmul_ai_*cmul_br_)/(1<<19); \
}

/* Reverse the order of the lowest `base` base-4 digits of `a`.
 *
 * a     value whose digits are reversed; digits above the lowest `base`
 *       ones are dropped
 * base  number of base-4 digits to process; 0 or less yields 0
 *
 * Returns the digit-reversed value.
 */
int quadreversal_radix4(int a,int base) {
  int reversed=0;
  int remaining;

  for (remaining=base;remaining>0;remaining--) {
    /* move the lowest digit of a onto the low end of the result */
    reversed=4*reversed+a%4;
    a/=4;
  }
  return reversed;
}


/* Reorder a block of two-int elements into base-4 digit-reversed order,
 * in place.
 *
 * symb  data block; element n occupies symb[2*n] and symb[2*n+1]
 * base  number of base-4 digits of an element index, i.e. log4(N): the
 *       block holds N = 4^base = 1<<(2*base) elements
 *
 * Always returns 0. The int return type is kept so existing callers and
 * declarations stay valid.
 */
int fftreversal_radix4(int *symb,int base) {
  int i,j,size;
  int tmp;

  size=(1<<(base*2));
  /* The first and the last element are never touched (the loop skips them).
     All others are exchanged with their mirror position; the i<j test makes
     sure every pair is swapped only once.
   */
  for (i=1;i<size-1;i++) {
    j=quadreversal_radix4(i,base);

    if (i<j) {
      tmp=symb[2*i];
      symb[2*i]=symb[2*j];
      symb[2*j]=tmp;

      tmp=symb[2*i+1];
      symb[2*i+1]=symb[2*j+1];
      symb[2*j+1]=tmp;
    }
  }
  return 0;
}

/* One radix-4 butterfly pass with the butterfly written out by hand.
 *
 * preal,pimag  first real / first imaginary value; consecutive elements are
 *              2 ints apart in both arrays
 * size         number of elements; the pass covers int offsets 0..2*size-1
 * step         pass number: the four inputs of one butterfly are
 *              2*4^step ints apart
 *
 * Every output is divided by 4 (truncating integer division).
 */
void fft_radix4_faststep(int *preal,int *pimag,const int size,const int step) {
  const int stride=2*(1<<(2*step)); /* 2*4^step */
  const int stride2=2*stride;
  const int stride3=3*stride;
  const int group=4*stride;         /* ints covered by one run of butterflies */
  const int count=stride/2;         /* butterflies per run */
  const int limit=2*size;
  int offset,done;

  for (offset=0;offset<limit;offset+=group) {
    int *re=preal+offset;
    int *im=pimag+offset;

    for (done=0;done<count;done++) {
      /* fetch all four inputs before anything is stored */
      const int r0=re[0],i0=im[0];
      const int r1=re[stride],i1=im[stride];
      const int r2=re[stride2],i2=im[stride2];
      const int r3=re[stride3],i3=im[stride3];

      /* sums and differences of inputs 0/2 and of inputs 1/3 */
      const int sum02r=r0+r2,dif02r=r0-r2;
      const int sum02i=i0+i2,dif02i=i0-i2;
      const int sum13r=r1+r3,dif13r=r1-r3;
      const int sum13i=i1+i3,dif13i=i1-i3;

      re[0]=(sum02r+sum13r)/4;
      im[0]=(sum02i+sum13i)/4;
      re[stride3]=(dif02r-dif13i)/4;
      im[stride3]=(dif02i+dif13r)/4;
      re[stride]=(dif02r+dif13i)/4;
      im[stride]=(dif02i-dif13r)/4;
      re[stride2]=(sum02r-sum13r)/4;
      im[stride2]=(sum02i-sum13i)/4;

      re+=2;
      im+=2;
    }
  }
}

/* Generic twiddle multiplication after butterfly pass `step`.
 *
 * The data is handled in groups of 4*nn elements with nn = 4*4^step. Inside
 * a group, run k (k = 1..3) starts k*nn elements in; its elements j = 1..nn-1
 * are multiplied in place by the table entry at int index (j*k*radix) % v,
 * real part at fft_Wk_int20[u], imaginary part at fft_Wk_int20[u+1].
 * Run 0 and element 0 of every run are left unchanged.
 *
 * preal,pimag  first real / first imaginary value, elements 2 ints apart
 * size         number of elements
 * step         number of the butterfly pass that has just been done
 */
void fft_radix4_twiddle(int *preal,int *pimag,const int size,const int step) {
  int i,j,k,u,w,pos,ar,ai;
  float tmpa,tmpb; /* scratch kept in scope for the multiply macro, which may use it by name */
  int *Wkreal=fft_Wk_int20;
  int *Wkimag=fft_Wk_int20+1;
  const int nn=4*(1<<(step*2));
  const int radix=FFT_MAXSIZE/(2*nn);
  const int v=2*FFT_MAXSIZE;
  const int group=4*nn;

#ifdef FFT_PRINT
  printf("twiddle step %d nn=%d v=%d radix=%d\n",step,nn,v,radix);
#endif
  for (i=0;i<size;i+=group) {
    for (k=1;k<4;k++) {
      w=k*radix;          /* table increment per element of run k */
      pos=2*(i+k*nn+1);   /* int offset of element j=1 of run k */
      u=w;
      for (j=1;j<nn;j++) {
        u%=v;
        ar=preal[pos];
        ai=pimag[pos];
        _complex_mul_int20(ar,ai,Wkreal[u],Wkimag[u],preal[pos],pimag[pos]);

#ifdef FFT_PRINT
        printf("i%3d j%d k%d twiddle %4d factor (%d,%d)\n",i,j,k,u,Wkreal[u],Wkimag[u]);
#endif
        pos+=2;
        u+=w;
      }
    }
  }
}

/* Twiddle multiplication for a radix-Q pass.
 *
 * The data is handled in groups of Q*nn elements with
 * nn = Q * 2^(step*Q/2) (integer arithmetic). Inside a group, run k
 * (k = 1..Q-1) starts k*nn elements in; its elements j = 1..nn-1 are
 * multiplied in place by the table entry at int index (k*j*2*radix) % v,
 * real part at fft_Wk_int20[u], imaginary part at fft_Wk_int20[u+1].
 *
 * NOTE(review): for Q = 4 the table index here is twice the one used by
 * fft_radix4_twiddle() (k*j*2*radix versus j*k*radix) - confirm which
 * one is intended before using this routine with Q = 4.
 * NOTE(review): the original inline remark on nn read "Q*2^(Q*step)", which
 * does not match the expression step*Q/2 actually evaluated.
 *
 * preal,pimag  first real / first imaginary value, elements 2 ints apart
 * size         number of elements
 * Q            radix
 * step         number of the pass that has just been done
 */
void fft_radixQ_twiddle(int *preal,int *pimag,const int size,const int Q,const int step) {
  int i,j,k,l,u,ar,ai;
  float tmpa,tmpb; /* scratch kept in scope for the multiply macro, which may use it by name */
  int *Wkreal=fft_Wk_int20;
  int *Wkimag=fft_Wk_int20+1;
  const int nn=Q*(1<<(step*Q/2));
  const int radix=FFT_MAXSIZE/(2*nn);
  const int v=2*FFT_MAXSIZE;
  const int group=Q*nn;

#ifdef FFT_PRINT
  printf("twiddle step %d Q=%d nn=%d v=%d radix=%d\n",step,Q,nn,v,radix);
#endif
  i=0;
  while (i<size) {
    for (k=1;k<Q;k++) {
      l=2*(i+k*nn+1); /* int offset of element j=1 of run k */
      for (j=1;j<nn;j++) {
        u=(k*j*2*radix)%v;
        /* c = b * Wk */
        ar=preal[l];
        ai=pimag[l];
        _complex_mul_int20(ar,ai,Wkreal[u],Wkimag[u],preal[l],pimag[l]);

#ifdef FFT_PRINT
        printf("i%3d j%d k%d l%d twiddle %4d factor (%d,%d)\n",i,j,k,l,u,Wkreal[u],Wkimag[u]);
#endif
        l+=2;
      }
    }
    i+=group;
  }
}

/* Multiply the value (real[idx], imag[idx]) in place by (1 - j)/sqrt(2):
 *   real' = (real + imag) * onedivsqrt2_int20 / 2^19
 *   imag' = (imag - real) * onedivsqrt2_int20 / 2^19
 * Both results are truncated toward zero when stored.
 *
 * Uses the pointers `real` and `imag` from the caller's scope; no other
 * caller variable is read or written. The arithmetic is done in double
 * because the scaled product needs up to 39 bits, more than the 24-bit
 * mantissa of a float can hold exactly.
 */
#define fft_calctwiddle_N8(idx) { \
    const double n8_re_=real[(idx)]; \
    const double n8_im_=imag[(idx)]; \
    real[(idx)]=((n8_re_+n8_im_)*onedivsqrt2_int20)/(1<<19); \
    imag[(idx)]=((n8_im_-n8_re_)*onedivsqrt2_int20)/(1<<19); \
}

/* Multiply the value (real[idx], imag[idx]) in place by (-1 - j)/sqrt(2):
 *   real' = (imag - real) * onedivsqrt2_int20 / 2^19
 *   imag' = (-real - imag) * onedivsqrt2_int20 / 2^19
 * Both results are truncated toward zero when stored.
 *
 * Uses the pointers `real` and `imag` from the caller's scope; no other
 * caller variable is read or written. The arithmetic is done in double
 * because the scaled product needs up to 39 bits, more than the 24-bit
 * mantissa of a float can hold exactly.
 */
#define fft_calctwiddle_3N8(idx) { \
    const double n38_re_=real[(idx)]; \
    const double n38_im_=imag[(idx)]; \
    real[(idx)]=((n38_im_-n38_re_)*onedivsqrt2_int20)/(1<<19); \
    imag[(idx)]=((-n38_re_-n38_im_)*onedivsqrt2_int20)/(1<<19); \
}

/* Twiddle multiplication for groups of 16 elements (32 ints), fully written
 * out: within every group the elements at int offsets 10..14, 18..22 and
 * 26..30 are multiplied in place, everything else is left unchanged.
 *
 * Three table entries are fetched once (int indices FFT_MAXSIZE/8,
 * 3*FFT_MAXSIZE/8 and 9*FFT_MAXSIZE/8, imaginary part one int later); the
 * remaining factors are handled by the fft_calctwiddle_* macros or by a
 * plain exchange of real and imaginary part.
 *
 * preal,pimag  first real / first imaginary value, elements 2 ints apart
 * size         number of elements; int offsets 0..2*size-1 are covered
 */
void fft_radix4_twiddle0(int *preal,int *pimag,const int size) {
  int offset,held,ar,ai;
  int *real,*imag;   /* these names are used by the fft_calctwiddle_* macros */
  float tmpa,tmpb;   /* scratch kept in scope for the multiply macros, which may use it by name */
  const int w1re=fft_Wk_int20[FFT_MAXSIZE/8];
  const int w1im=fft_Wk_int20[FFT_MAXSIZE/8+1];
  const int w3re=fft_Wk_int20[3*FFT_MAXSIZE/8];
  const int w3im=fft_Wk_int20[3*FFT_MAXSIZE/8+1];
  const int w9re=fft_Wk_int20[9*FFT_MAXSIZE/8];
  const int w9im=fft_Wk_int20[9*FFT_MAXSIZE/8+1];
  const int limit=2*size;

  offset=0;
  while (offset<limit) {
    real=preal+offset;
    imag=pimag+offset;

    /* run 1, elements 1..3 */
    ar=real[10];
    ai=imag[10];
    _complex_mul_int20(ar,ai,w1re,w1im,real[10],imag[10]);
    fft_calctwiddle_N8(12);
    ar=real[14];
    ai=imag[14];
    _complex_mul_int20(ar,ai,w3re,w3im,real[14],imag[14]);

    /* run 2, elements 1..3 */
    fft_calctwiddle_N8(18);
    held=real[20];          /* (re,im) -> (im,-re) */
    real[20]=imag[20];
    imag[20]=-held;
    fft_calctwiddle_3N8(22);

    /* run 3, elements 1..3 */
    ar=real[26];
    ai=imag[26];
    _complex_mul_int20(ar,ai,w3re,w3im,real[26],imag[26]);
    fft_calctwiddle_3N8(28);
    ar=real[30];
    ai=imag[30];
    _complex_mul_int20(ar,ai,w9re,w9im,real[30],imag[30]);

    offset+=32;
  }
}

/* Twiddle multiplication after butterfly pass `step`, with the special
 * factors handled separately.
 *
 * The data is processed in groups of 4*nn elements (8*nn ints) with
 * nn = 4*4^step. A group consists of four runs of nn elements. Run 0 and
 * element 0 of every run are left unchanged. Element j (1..nn-1) of run r
 * (1..3) is multiplied in place by the table entry at int index r*j*radix
 * (real part at fft_Wk_int20[u], imaginary part at fft_Wk_int20[u+1]),
 * except at the positions where the code below uses fft_calctwiddle_N8,
 * fft_calctwiddle_3N8 or a plain exchange of real and imaginary part instead
 * of a table lookup.
 *
 * In the section comments "run r" is the run number, while the variable k is
 * the running int offset of the current element and u the running table
 * index. Both are carried from one section to the next, so the sections must
 * stay in this order.
 *
 * preal,pimag  first real / first imaginary value, elements 2 ints apart
 * size         number of elements; int offsets 0..2*size-1 are covered
 * step         number of the butterfly pass that has just been done
 *
 * Note: the local w is declared but never used.
 */
void fft_radix4_twiddleN(int *preal,int *pimag,const int size,const int step) {
  register int tmp,i,j,k,nn,n2,w,v,u;
  register int radix,radix2,radix3;
  register int *real,*imag;
  register int *wkreal,*wkimag;
  register int ar,ai;
  register float tmpa,tmpb;

  wkreal=fft_Wk_int20;
  wkimag=fft_Wk_int20+1;
  nn=4*(1<<(step*2));
  radix=FFT_MAXSIZE/(2*nn);
  radix2=2*radix;
  radix3=3*radix;
  v=2*FFT_MAXSIZE;
  n2=nn*2;
  real=preal;
  imag=pimag;
#ifdef FFT_PRINT
  printf("twiddle step%d nn%d v%d radix%d\n",step,nn,v,radix);
#endif
  for(i=0;i<2*size;i+=(8*nn)) {
    /* run 0 and element 0 of each run need no multiplication */

    /* run 1, 0 < j < nn/2: start at element 1 of run 1 */
    k=i+n2+2;
    u=radix;
    for(j=1;j<nn/2;j++) {
      ar=real[k];ai=imag[k];
      _complex_mul_int20(ar,ai,wkreal[u],wkimag[u],real[k],imag[k]);
      k+=2;
      u+=radix;
    }
    /* run 1, j == nn/2 */
    fft_calctwiddle_N8(k);
    k+=2;
    u+=radix;
    /* run 1, nn/2 < j < nn */
    for(j=1;j<nn/2;j++) {
      ar=real[k];ai=imag[k];
      _complex_mul_int20(ar,ai,wkreal[u],wkimag[u],real[k],imag[k]);
      k+=2;
      u+=radix;
    }
    /* skip element 0 of run 2 and restart the table index */
    k+=2;
    u=radix2;
    /* run 2, 0 < j < nn/4 */
    for(j=1;j<nn/4;j++) {
      ar=real[k];ai=imag[k];
      _complex_mul_int20(ar,ai,wkreal[u],wkimag[u],real[k],imag[k]);
      k+=2;
      u+=radix2;
    }
    /* run 2, j == nn/4 */
    fft_calctwiddle_N8(k);
    k+=2;
    u+=radix2;
    /* run 2, nn/4 < j < nn/2 */
    for(j=1;j<nn/4;j++) {
      ar=real[k];ai=imag[k];
      _complex_mul_int20(ar,ai,wkreal[u],wkimag[u],real[k],imag[k]);
      k+=2;
      u+=radix2;
    }
    /* run 2, j == nn/2: (re,im) -> (im,-re), no multiplication needed */
    tmp=-real[k];
    real[k]=imag[k];
    imag[k]=tmp;
    k+=2;
    u+=radix2;
    /* run 2, nn/2 < j < nn*3/4 */
    for(j=1;j<nn/4;j++) {
      ar=real[k];ai=imag[k];
      _complex_mul_int20(ar,ai,wkreal[u],wkimag[u],real[k],imag[k]);
      k+=2;
      u+=radix2;
    }
    /* run 2, j == nn*3/4 */
    fft_calctwiddle_3N8(k);
    k+=2;
    u+=radix2;
    /* run 2, nn*3/4 < j < nn */
    for(j=1;j<nn/4;j++) {
      ar=real[k];ai=imag[k];
      _complex_mul_int20(ar,ai,wkreal[u],wkimag[u],real[k],imag[k]);
      k+=2;
      u+=radix2;
    }
    /* restart the table index and skip element 0 of run 3 */
    u=radix3;
    k+=2;
    /* run 3, 0 < j < nn/2; only this run reduces u modulo v */
    for(j=1;j<nn/2;j++) {
      ar=real[k];ai=imag[k];
      _complex_mul_int20(ar,ai,wkreal[u],wkimag[u],real[k],imag[k]);
      k+=2;
      u+=radix3;
      u%=v;
    }
    /* run 3, j == nn/2 */
    fft_calctwiddle_3N8(k);
    k+=2;
    u+=radix3;
    u%=v;
    /* run 3, nn/2 < j < nn */
    for(j=1;j<nn/2;j++) {
      ar=real[k];ai=imag[k];
      _complex_mul_int20(ar,ai,wkreal[u],wkimag[u],real[k],imag[k]);
      k+=2;
      u+=radix3;
      u%=v;
    }
  }
}


/* Prototype of the transform core: it is defined further down in this file
 * but called by fft_radix4() and ifft_radix4() before that definition.
 */
void _fft_radix4(int *real,int *imag,int base);

/* Transform a block of interleaved values in place: digit-reversal
 * reordering first, then the radix-4 core with symb[2*n] as real and
 * symb[2*n+1] as imaginary part.
 *
 * symb  interleaved data block
 * base  log(size)/log(4), size being the number of elements
 */
void fft_radix4(int *symb,int base) {
  int *const re=symb;
  int *const im=symb+1;

  fftreversal_radix4(symb,base);
  _fft_radix4(re,im,base);
}

/* Same as fft_radix4(), but the core is run with the two parts exchanged:
 * symb[2*n+1] is passed as real and symb[2*n] as imaginary part
 * (presumably to obtain the inverse transform from the same core).
 *
 * symb  interleaved data block
 * base  log(size)/log(4), size being the number of elements
 */
void ifft_radix4(int *symb,int base) {
  int *const re=symb;
  int *const im=symb+1;

  fftreversal_radix4(symb,base);
  _fft_radix4(im,re,base);
}

/* Transform core: runs `base` radix-4 butterfly passes over 4^base elements,
 * with a twiddle multiplication between consecutive passes.
 *
 * real,imag  first real / first imaginary value, elements 2 ints apart;
 *            the data is expected to be in digit-reversed order already
 * base       number of passes, log4 of the number of elements
 *
 * Pass 0 is followed by the generic twiddle routine, passes 1..base-2 by
 * fft_radix4_twiddleN(); the last pass has no twiddle step. Every pass
 * divides its outputs by 4.
 *
 * Small sizes are handled explicitly, because the twiddle step and the
 * trailing pass of the general sequence address at least 16 elements:
 *   base < 1   a single element (or an invalid size) - nothing is done
 *   base == 1  four elements - exactly one butterfly pass
 */
void _fft_radix4(int *real,int *imag,int base) {
  int i,size;
#ifdef FFT_PRINT_STEPS
  int j;
#endif

  if (base<1) return;

  size=(1<<(base*2)); /* 4^base */
  fft_radix4_faststep(real,imag,size,0);
  if (base==1) return;
  fft_radix4_twiddle(real,imag,size,0);

  for (i=1;i<base-1;i++) {
    fft_radix4_faststep(real,imag,size,i);
#ifdef FFT_PRINT_STEPS
    printf("-step %d\n",i);
    /* dump at most the first 64 elements, never more than the block holds */
    for (j=0;j<64 && j<size;j++) printf("%3d %8d + ( %8d *j )\n",j,real[2*j],imag[2*j]);
#endif
    fft_radix4_twiddleN(real,imag,size,i);
#ifdef FFT_PRINT_STEPS
    printf("-twiddle %d\n",i);
    for (j=0;j<64 && j<size;j++) printf("%3d %8d + ( %8d *j )\n",j,real[2*j],imag[2*j]);
#endif
  }
  /* last pass: i == base-1 here, no twiddle multiplication afterwards */
  fft_radix4_faststep(real,imag,size,i);
#ifdef FFT_PRINT_STEPS
  printf("-step %d\n",i);
  for (j=0;j<64 && j<size;j++) printf("%3d %8d + ( %8d *j )\n",j,real[2*j],imag[2*j]);
#endif
}



