/*****************************************************************************
   Major portions of this software are copyrighted by the Medical College
   of Wisconsin, 1994-2000, and are released under the Gnu General Public
   License, Version 2.  See the file README.Copyright for details.
******************************************************************************/

#undef STANDALONE  /* Define this if you want to use this code */
                   /* outside of the AFNI package (libmri.a).  */

#ifdef STANDALONE
#  include <stdlib.h>   /* include standard system headers */
#  include <stddef.h>
#  include <stdio.h>
#  include <math.h>
   typedef struct complex { float r,i ; } complex ;  /* need this! */
#  undef USE_FFTW
#  define EXIT exit
#else
#  include "mrilib.h"   /* AFNI package library header */

#  ifdef USE_FFTW         /* 20 Oct 2000 */
#    include "fftw.h"
#  endif

#endif  /* STANDALONE */

static int use_fftw=1 ;
void csfft_use_fftw( int uf ){ use_fftw = uf; return; }

/***=====================================================================**
 *** Prototypes for externally callable functions:                       **
 ***    Complex-to-complex FFT in place:                                 **
 ***      mode = -1 or +1 (for inverse scaling, cf. csfft_scale_inverse  **
 ***      idim = dimension (power of 2, maybe with factors of 3 and 5)   **
 ***      xc   = input/output array                                      **
 ***    Re-initializes itself only when idim changes from previous call. **
 ***=====================================================================**/

void csfft_cox( int mode , int idim , complex * xc ) ;
int  csfft_nextup( int idim ) ;
int  csfft_nextup_one35( int idim ) ;
void csfft_scale_inverse( int scl ) ; /* scl=1 ==> force 1/N for mode=+1 **/
                                      /* scl=0 ==> no 1/N scaling        **/

/*** Aug 1999:                                                           **
 ***   idim can now contain factors of 3 and/or 5, up to and including   **
 ***   3^3 * 5^3.  Some examples:                                        **
 ***       135 144 150 160 180 192 200 216 225 240 250                   **
 ***   The routine csfft_nextup(n) returns the smallest size FFT         **
 ***    >= n that csfft_cox() knows how to do.                           **
 ***   Note that efficiency of the different lengths can be quite        **
 ***   different.  In general, powers of 2 are fastest, but a            **
 ***   single factor of 3 and/or 5 doesn't cause too much slowdown.      **
 ***   The routine csfft_nextup_one35(n) returns the smallest size FFT   **
 ***    >= n that contains at most one power of 3, at most one power     **
 ***   of 5, and least one power of 2.  In trials, these are the most    **
 ***   time efficient.                                                   **/

/*-- Aug 1999: routines to do FFTs by decimation by 3 or 5 --*/

static void fft_3dec( int , int , complex * ) ;
static void fft_5dec( int , int , complex * ) ;

#undef  RMAX
#define RMAX 3   /* maximum levels of recursion for radix-3 and radix-5 */

#undef  PI
#define PI (3.141592653589793238462643)

/*-------------- To order the 1/N scaling on inversion: Aug 1999 ------------*/

static int sclinv = 0 ;  /* set by csfft_scale_inverse */

void csfft_scale_inverse( int scl ){ sclinv = scl; return; }

/*-------------- For the unrolled FFT routines: November 1998 --------------*/

#ifndef DONT_UNROLL_FFTS
   static void fft8  ( int mode , complex * xc ) ; /* completely */
   static void fft16 ( int mode , complex * xc ) ; /* unrolled   */
   static void fft32 ( int mode , complex * xc ) ; /* routines   */

   static void fft64 ( int mode , complex * xc ) ; /* internal 2dec ->  32 */
   static void fft128( int mode , complex * xc ) ; /* internal 4dec ->  32 */
   static void fft256( int mode , complex * xc ) ; /* internal 4dec ->  64 */
   static void fft512( int mode , complex * xc ) ; /* internal 4dec -> 128 */

   static void fft_4dec( int , int , complex * ) ; /* general 4dec */

#  define fft1024(m,x) fft_4dec(m,1024,x)          /* 4dec -> 256 */
#  define fft2048(m,x) fft_4dec(m,2048,x)          /* 4dec -> 512 */

/*-- do FFTs of size 2 and 4 with macros --*/

#  define fft2(m,x) do{ float a,b,c,d ;                             \
                        a = x[0].r + x[1].r ; b = x[0].i + x[1].i ; \
                        c = x[0].r - x[1].r ; d = x[0].i - x[1].i ; \
                        x[0].r = a ; x[0].i = b ;                   \
                        x[1].r = c ; x[1].i = d ; } while(0)

# define USE_FFT4_MACRO
# ifndef USE_FFT4_MACRO
   static void fft4( int mode , complex * xc ) ; /* unrolled routine */
# else
#  define fft4(m,x) do{ float acpr,acmr,bdpr,bdmr, acpi,acmi,bdpi,bdmi; \
                        acpr = x[0].r + x[2].r; acmr = x[0].r - x[2].r; \
                        bdpr = x[1].r + x[3].r; bdmr = x[1].r - x[3].r; \
                        acpi = x[0].i + x[2].i; acmi = x[0].i - x[2].i; \
                        bdpi = x[1].i + x[3].i; bdmi = x[1].i - x[3].i; \
                        x[0].r = acpr+bdpr ; x[0].i = acpi+bdpi ;       \
                        x[2].r = acpr-bdpr ; x[2].i = acpi-bdpi ;       \
                        if(m > 0){                                      \
                          x[1].r = acmr-bdmi ; x[1].i = acmi+bdmr ;     \
                          x[3].r = acmr+bdmi ; x[3].i = acmi-bdmr ;     \
                        } else {                                        \
                          x[1].r = acmr+bdmi ; x[1].i = acmi-bdmr ;     \
                          x[3].r = acmr-bdmi ; x[3].i = acmi+bdmr ;     \
                        } } while(0)
# endif
#endif

/*----------------------------------------------------------------------
   Speedups with unrolled FFTs [program fftest.c]:
   Pentium II 400 MHz:  58% (32) to 36% (256)  [gcc -O3 -ffast-math]
   SGI R10000 175 MHz:  47% (32) to 18% (256)  [cc -Ofast]
   HP PA-8000 200 MHz:  58% (32) to 40% (256)  [cc +O3 +Oaggressive]
------------------------------------------------------------------------*/

/*----------------- the csfft trig constants tables ------------------*/

static complex * csplus = NULL , * csminus = NULL ;
static int nold = -666 ;

/*--------------------------------------------------------------------
   Initialize csfft trig constants table.  Adapted from AJ's code.
   Does both mode=1 and mode=-1 tables.
----------------------------------------------------------------------*/

static void csfft_trigconsts( int idim )  /* internal function */
{
   register unsigned int  m, n, i0, i1, i2, i3, k;
   register complex       *r0, *csp;
   register float         co, si, f0, f1, f2, f3, f4;
   double                 al;

   if( idim == nold ) return ;

   if( idim > nold ){
      if( csplus != NULL ){ free(csplus) ; free(csminus) ; }
      csplus  = (complex *) malloc( sizeof(complex) * idim ) ;
      csminus = (complex *) malloc( sizeof(complex) * idim ) ;
      if( csplus == NULL || csminus == NULL ){
         fprintf(stderr,"\n*** csfft cannot malloc space! ***\n"); EXIT(1) ;
      }
   }
   nold = n = idim ;

   f1 = 1.0 ;  /* csplus init */
   m  = 1; k  = 0;
   while (n > m) {
      i3 = m << 1; f2 = m; al = f1*PI/f2;
      co = cos(al); si = sin(al);
      (csplus + k)->r = 1.; (csplus + k)->i = 0.;
      for (i0=0; i0 < m; i0++) {
         k++;
         csp = csplus + k; r0 = csp - 1;
         csp->r = r0->r * co - r0->i * si;
         csp->i = r0->i * co + r0->r * si;
      }
      m = i3;
   }

   f1 = -1.0 ;  /* csminus init */
   m  = 1; k  = 0;
   while (n > m) {
      i3 = m << 1; f2 = m; al = f1*PI/f2;
      co = cos(al); si = sin(al);
      (csminus + k)->r = 1.; (csminus + k)->i = 0.;
      for (i0=0; i0 < m; i0++) {
         k++;
         csp = csminus + k; r0  = csp - 1;
         csp->r = r0->r * co - r0->i * si;
         csp->i = r0->i * co + r0->r * si;
      }
      m = i3;
   }
   return ;
}

/*--------------------------------------------------------------------
   Complex-to-complex FFT in place:
     mode = -1 or +1 (1/idim scaling on inverse [+1] if sclinv
                      was set in the call to csfft_scale_inverse)
     idim = dimension (power of 2, possibly with a factor
                       of 3^p * 5^q also, for p,q=0..RMAX)
     xc   = input/output array
   Automagically re-initializes itself when idim changes from
   previous call.  By AJ (Andrzej Jesmanowicz), modified by RWCox.
----------------------------------------------------------------------*/

/*- Macro to do 1/N scaling on inverse:
      if it is ordered by the user, and
      if mode is positive, and
      if this is not a recursive call.  -*/

#define SCLINV                                                 \
 if( sclinv && mode > 0 && rec == 0 ){                         \
   register int qq ; register float ff = 1.0 / (float) idim ;  \
   for( qq=0 ; qq < idim ; qq++ ){ xc[qq].r *= ff ; xc[qq].i *= ff ; } }

void csfft_cox( int mode , int idim , complex * xc )
{
   register unsigned int  m, n, i0, i1, i2, i3, k;
   register complex       *r0, *r1, *csp;
   register float         co, si, f0, f1, f2, f3, f4;

   static int rec=0 ;  /* recursion level */

   /*-- 20 Oct 2000: maybe use FFTW instead --*/

#ifdef USE_FFTW
   if( use_fftw ){
     static int first=1 , oldmode=0, oldidim=0 ;
     static fftw_plan fpl ;
     if( first ){
        char * env = getenv("AFNI_FFTW_WISDOM") ;
        int gotit=0 ;
        first = 0 ;
        if( env != NULL ){
           int nw = THD_filesize(env) ;
           if( nw > 0 ){
              int fd = open( env , O_RDONLY ) ;
              if( fd >= 0 ){
                 char * w = AFMALL( char, nw+4 ) ;
                 int ii = read( fd , w , nw ) ;
                 close(fd) ;
                 if( ii > 0 ){
                    w[nw] = '\0' ;
                    nw = (int) fftw_import_wisdom_from_string(w) ;
                    gotit=(nw == FFTW_SUCCESS);
                 } /* else fprintf(stderr,"csfft_cox: read fails\n"); */
                 free(w) ;
              } /* else fprintf(stderr,"csfft_cox: open fails\n"); */
           } /* else fprintf(stderr,"csfft_cox: THD_filesize(%s) fails\n",env); */
        } /* else fprintf(stderr,"csfft_cox: getenv fails\n"); */
     /* if( gotit )
          fprintf(stderr,"csfft_cox imported FFTW wisdom from file %s\n",env);
        else
          fprintf(stderr,"csfft_cox failed to import FFTW wisdom\n"); */
     }

     if( idim != oldidim || mode != oldmode ){
        if( oldidim ) fftw_destroy_plan(fpl) ;
        fpl = fftw_create_plan( idim ,
                                (mode < 0) ? FFTW_FORWARD : FFTW_BACKWARD ,
                                FFTW_ESTIMATE|FFTW_USE_WISDOM|FFTW_IN_PLACE );
        oldidim = idim ; oldmode = mode ;
     }

     fftw_one( fpl , (fftw_complex *)xc , NULL ) ;
     SCLINV ; return ;
   }
#endif /* USE_FFTW */

   /*-- November 1998: maybe use the unrolled FFT routines --*/

#ifndef DONT_UNROLL_FFTS
   switch( idim ){                                  /* none of these  */
      case    1:                           return;  /* routines will  */
      case    2: fft2   (mode,xc); SCLINV; return;  /* call csfft_cox */
      case    4: fft4   (mode,xc); SCLINV; return;  /* so don't need  */
      case    8: fft8   (mode,xc); SCLINV; return;  /* to do rec++/-- */
      case   16: fft16  (mode,xc); SCLINV; return;
      case   32: fft32  (mode,xc); SCLINV; return;
      case   64: fft64  (mode,xc); SCLINV; return;
      case  128: fft128 (mode,xc); SCLINV; return;
      case  256: fft256 (mode,xc); SCLINV; return;
      case  512: fft512 (mode,xc); SCLINV; return;
      case 1024: fft1024(mode,xc); SCLINV; return;
      case 2048: fft2048(mode,xc); SCLINV; return;

      case  4096: fft_4dec(mode, 4096,xc); SCLINV; return; /* 4dec -> 1024 */
      case  8192: fft_4dec(mode, 8192,xc); SCLINV; return; /* 4dec -> 2048 */
      case 16384: fft_4dec(mode,16384,xc); SCLINV; return; /* 4dec -> 4096 */
      case 32768: fft_4dec(mode,32768,xc); SCLINV; return; /* 4dec -> 8192 */
   }
#endif  /* end of unrollificationizing */

   /*-- Aug 1999: deal with factors of 3 or 5 [might call csfft_cox] --*/

   if( idim%3 == 0 ){rec++; fft_3dec(mode,idim,xc); rec--; SCLINV; return;}
   if( idim%5 == 0 ){rec++; fft_5dec(mode,idim,xc); rec--; SCLINV; return;}

   /*-- below here: the old general power of 2 routine --*/

   /**-- perhaps initialize --**/

   if( nold != idim ) csfft_trigconsts( idim ) ;

   /** Main loop starts here **/

   n   = idim;
   i2  = idim >> 1;
   i1  = 0;
   csp = (mode > 0) ? csplus : csminus ;  /* choose const array */

   /*-- swap data --*/

   for (i0=0; i0 < n; i0 ++) {
      if ( i1 > i0 ) {
         r0    = xc + i0; r1    = xc + i1;
         f1    = r0->r;   f2    = r0->i;
         r0->r = r1->r;   r0->i = r1->i;
         r1->r = f1;      r1->i = f2;
      }
      m = i2;
      while ( m && !(i1 < m) ) { i1 -= m; m >>= 1; }
     i1 += m;
   }

   /*-- compute compute compute --*/

   m = 1; k = 0;
   while (n > m) {
      i3 = m << 1;
      for (i0=0; i0 < m; i0 ++) {
         co = (csp + k)->r; si = (csp + k)->i;
         for (i1=i0; i1 < n; i1 += i3) {
            r0    = xc + i1;    r1    = r0 + m;
            f1    = r1->r * co; f2    = r1->i * si;
            f3    = r1->r * si; f4    = r1->i * co;
            f1   -= f2;         f3   += f4;
            f2    = r0->r;      f4    = r0->i;
            r1->r = f2 - f1;    r1->i = f4 - f3;
            r0->r = f2 + f1;    r0->i = f4 + f3;
         }
         k++;
      }
      m = i3;
   }

   SCLINV ; return ;
}

/*--------------------------------------------------------------------
   Many complex-to-complex FFTs in place [vectorized]:
     mode = -1 or +1 (NO SCALING ON INVERSE!)
     idim = dimension (power of 2)
     nvec = number of vectors to do (vectors are contiguous)
     xc   = input/output array
   Automagically re-initializes itself when idim changes from
   previous call.  By AJJesmanowicz, modified by RWCox.
----------------------------------------------------------------------*/

void csfft_many( int mode , int idim , int nvec , complex * xc )
{
   register unsigned int  m, n, i0, i1, i2, i3, k , iv ;
   register complex       *r0, *r1, *csp , *xcx;
   register float         co, si, f0, f1, f2, f3, f4;

   if( nvec == 1 ){ csfft_cox( mode , idim , xc ) ; return ; }

   if( idim % 3 == 0 ){                           /* Aug 1999 */
      for( m=0 ; m < nvec ; m++ )
         fft_3dec( mode , idim , xc + m*idim ) ;
      return ;
   } else if( idim % 5 == 0 ){
      for( m=0 ; m < nvec ; m++ )
         fft_5dec( mode , idim , xc + m*idim ) ;
      return ;
   }

   /** perhaps initialize **/

   if( nold != idim ) csfft_trigconsts( idim ) ;

   n   = idim;
   i2  = idim >> 1;
   i1  = 0;
   csp = (mode > 0) ? csplus : csminus ;  /* choose const array */

   for (i0=0; i0 < n; i0 ++) {
      if ( i1 > i0 ) {
         for( iv=0,xcx=xc ; iv < nvec ; iv++,xcx+=n ){
            r0    = xcx + i0; r1    = xcx + i1;
            f1    = r0->r;    f2    = r0->i;
            r0->r = r1->r;    r0->i = r1->i;
            r1->r = f1;       r1->i = f2;
         }
      }
      m = i2;
      while ( m && !(i1 < m) ) { i1 -= m; m >>= 1; }
     i1 += m;
   }

#define I00
#ifdef I00
#  define I0BOT 1
#else
#  define I0BOT 0
#endif

   m = 1;
   k = 0;
   while (n > m) {
      i3 = m << 1;

#ifdef I00
      /* handle i0=0 case [co=1,si=0] in special code */

      for (i1=0; i1 < n; i1 += i3) {
         for( iv=0,r0=xc+i1 ; iv < nvec ; iv++,r0+=n ){
            r1    = r0 + m;
            f1    = r1->r ;   f3    = r1->i ;
            f2    = r0->r ;   f4    = r0->i ;
            r1->r = f2 - f1 ; r1->i = f4 - f3 ;
            r0->r = f2 + f1 ; r0->i = f4 + f3 ;
         }
      }
      k++;
#endif

      for (i0=I0BOT; i0 < m; i0 ++) {
         co = (csp + k)->r; si = (csp + k)->i;
         for (i1=i0; i1 < n; i1 += i3) {
            for( iv=0,r0=xc+i1 ; iv < nvec ; iv++,r0+=n ){
               r1    = r0 + m;
#if 1
               f1    = r1->r * co - r1->i * si ;
               f3    = r1->r * si + r1->i * co ;
#else
               f1    = r1->r * co ; f2    = r1->i * si ;
               f3    = r1->r * si ; f4    = r1->i * co ;
               f1   -= f2 ;         f3   += f4 ;
#endif
               f2    = r0->r ;   f4    = r0->i ;
               r1->r = f2 - f1 ; r1->i = f4 - f3 ;
               r0->r = f2 + f1 ; r0->i = f4 + f3 ;
            }
         }
         k++;
      }
      m = i3;
   }
   return ;
}

/****************************************************************************
   Everything from here on down is for the unrolled FFT routines.
   They were generated with program fftprint.c.
*****************************************************************************/

#ifndef DONT_UNROLL_FFTS

#ifndef USE_FFT4_MACRO
/**************************************/
/* FFT routine unrolled of length   4 */

static void fft4( int mode , complex * xc )
{
   register complex * csp , * xcx=xc;
   register float f1,f2,f3,f4 ;

   /** perhaps initialize **/

   if( nold != 4 ) csfft_trigconsts( 4 ) ;

   csp = (mode > 0) ? csplus : csminus ;  /* choose const array */

   /** data swapping part **/

   f1 = xcx[1].r ; f2 = xcx[1].i ;
   xcx[1].r = xcx[2].r ; xcx[1].i = xcx[2].i ;
   xcx[2].r = f1 ; xcx[2].i = f2 ;

   /** butterflying part **/

   f1 = xcx[1].r ; f3 = xcx[1].i ;  /* cos=1 sin=0 */
   f2 = xcx[0].r ; f4 = xcx[0].i ;
   xcx[1].r = f2-f1 ; xcx[1].i = f4-f3 ;
   xcx[0].r = f2+f1 ; xcx[0].i = f4+f3 ;

   f1 = xcx[3].r ; f3 = xcx[3].i ;  /* cos=1 sin=0 */
   f2 = xcx[2].r ; f4 = xcx[2].i ;
   xcx[3].r = f2-f1 ; xcx[3].i = f4-f3 ;
   xcx[2].r = f2+f1 ; xcx[2].i = f4+f3 ;

   f1 = xcx[2].r ; f3 = xcx[2].i ;  /* cos=1 sin=0 */
   f2 = xcx[0].r ; f4 = xcx[0].i ;
   xcx[2].r = f2-f1 ; xcx[2].i = f4-f3 ;
   xcx[0].r = f2+f1 ; xcx[0].i = f4+f3 ;

   f1 = - xcx[3].i * csp[2].i ; /* cos=0 twiddles */
   f3 = xcx[3].r * csp[2].i ;
   f2 = xcx[1].r ; f4 = xcx[1].i ;
   xcx[3].r = f2-f1 ; xcx[3].i = f4-f3 ;
   xcx[1].r = f2+f1 ; xcx[1].i = f4+f3 ;

   return ;
}
#endif /* USE_FFT4_MACRO */

/**************************************/
/* FFT routine unrolled of length   8 */

static void fft8( int mode , complex * xc )
{
   register complex * csp , * xcx=xc;
   register float f1,f2,f3,f4 ;

   /** perhaps initialize **/

   if( nold != 8 ) csfft_trigconsts( 8 ) ;

   csp = (mode > 0) ? csplus : csminus ;  /* choose const array */

   /** data swapping part **/

   f1 = xcx[1].r ; f2 = xcx[1].i ;
   xcx[1].r = xcx[4].r ; xcx[1].i = xcx[4].i ;
   xcx[4].r = f1 ; xcx[4].i = f2 ;

   f1 = xcx[3].r ; f2 = xcx[3].i ;
   xcx[3].r = xcx[6].r ; xcx[3].i = xcx[6].i ;
   xcx[6].r = f1 ; xcx[6].i = f2 ;

   /** butterflying part **/

   f1 = xcx[1].r ; f3 = xcx[1].i ;  /* cos=1 sin=0 */
   f2 = xcx[0].r ; f4 = xcx[0].i ;
   xcx[1].r = f2-f1 ; xcx[1].i = f4-f3 ;
   xcx[0].r = f2+f1 ; xcx[0].i = f4+f3 ;

   f1 = xcx[3].r ; f3 = xcx[3].i ;  /* cos=1 sin=0 */
   f2 = xcx[2].r ; f4 = xcx[2].i ;
   xcx[3].r = f2-f1 ; xcx[3].i = f4-f3 ;
   xcx[2].r = f2+f1 ; xcx[2].i = f4+f3 ;

   f1 = xcx[5].r ; f3 = xcx[5].i ;  /* cos=1 sin=0 */
   f2 = xcx[4].r ; f4 = xcx[4].i ;
   xcx[5].r = f2-f1 ; xcx[5].i = f4-f3 ;
   xcx[4].r = f2+f1 ; xcx[4].i = f4+f3 ;

   f1 = xcx[7].r ; f3 = xcx[7].i ;  /* cos=1 sin=0 */
   f2 = xcx[6].r ; f4 = xcx[6].i ;
   xcx[7].r = f2-f1 ; xcx[7].i = f4-f3 ;
   xcx[6].r = f2+f1 ; xcx[6].i = f4+f3 ;

   f1 = xcx[2].r ; f3 = xcx[2].i ;  /* cos=1 sin=0 */
   f2 = xcx[0].r ; f4 = xcx[0].i ;
   xcx[2].r = f2-f1 ; xcx[2].i = f4-f3 ;
   xcx[0].r = f2+f1 ; xcx[0].i = f4+f3 ;

   f1 = xcx[6].r ; f3 = xcx[6].i ;  /* cos=1 sin=0 */
   f2 = xcx[4].r ; f4 = xcx[4].i ;
   xcx[6].r = f2-f1 ; xcx[6].i = f4-f3 ;
   xcx[4].r = f2+f1 ; xcx[4].i = f4+f3 ;

   f1 = xcx[3].r * csp[2].r - xcx[3].i * csp[2].i ; /* twiddles */
   f3 = xcx[3].r * csp[2].i + xcx[3].i * csp[2].r ;
   f2 = xcx[1].r ; f4 = xcx[1].i ;
   xcx[3].r = f2-f1 ; xcx[3].i = f4-f3 ;
   xcx[1].r = f2+f1 ; xcx[1].i = f4+f3 ;

   f1 = xcx[7].r * csp[2].r - xcx[7].i * csp[2].i ; /* twiddles */
   f3 = xcx[7].r * csp[2].i + xcx[7].i * csp[2].r ;
   f2 = xcx[5].r ; f4 = xcx[5].i ;
   xcx[7].r = f2-f1 ; xcx[7].i = f4-f3 ;
   xcx[5].r = f2+f1 ; xcx[5].i = f4+f3 ;

   f1 = xcx[4].r ; f3 = xcx[4].i ;  /* cos=1 sin=0 */
   f2 = xcx[0].r ; f4 = xcx[0].i ;
   xcx[4].r = f2-f1 ; xcx[4].i = f4-f3 ;
   xcx[0].r = f2+f1 ; xcx[0].i = f4+f3 ;

   f1 = xcx[5].r * csp[4].r - xcx[5].i * csp[4].i ; /* twiddles */
   f3 = xcx[5].r * csp[4].i + xcx[5].i * csp[4].r ;
   f2 = xcx[1].r ; f4 = xcx[1].i ;
   xcx[5].r = f2-f1 ; xcx[5].i = f4-f3 ;
   xcx[1].r = f2+f1 ; xcx[1].i = f4+f3 ;

   f1 = xcx[6].r * csp[5].r - xcx[6].i * csp[5].i ; /* twiddles */
   f3 = xcx[6].r * csp[5].i + xcx[6].i * csp[5].r ;
   f2 = xcx[2].r ; f4 = xcx[2].i ;
   xcx[6].r = f2-f1 ; xcx[6].i = f4-f3 ;
   xcx[2].r = f2+f1 ; xcx[2].i = f4+f3 ;

   f1 = xcx[7].r * csp[6].r - xcx[7].i * csp[6].i ; /* twiddles */
   f3 = xcx[7].r * csp[6].i + xcx[7].i * csp[6].r ;
   f2 = xcx[3].r ; f4 = xcx[3].i ;
   xcx[7].r = f2-f1 ; xcx[7].i = f4-f3 ;
   xcx[3].r = f2+f1 ; xcx[3].i = f4+f3 ;

   return ;
}

/**************************************/
/* FFT routine unrolled of length  16 */

static void fft16( int mode , complex * xc )
{
   register complex * csp , * xcx=xc;
   register float f1,f2,f3,f4 ;

   /** perhaps initialize **/

   if( nold != 16 ) csfft_trigconsts( 16 ) ;

   csp = (mode > 0) ? csplus : csminus ;  /* choose const array */

   /** data swapping part **/

   f1 = xcx[1].r ; f2 = xcx[1].i ;
   xcx[1].r = xcx[8].r ; xcx[1].i = xcx[8].i ;
   xcx[8].r = f1 ; xcx[8].i = f2 ;

   f1 = xcx[2].r ; f2 = xcx[2].i ;
   xcx[2].r = xcx[4].r ; xcx[2].i = xcx[4].i ;
   xcx[4].r = f1 ; xcx[4].i = f2 ;

   f1 = xcx[3].r ; f2 = xcx[3].i ;
   xcx[3].r = xcx[12].r ; xcx[3].i = xcx[12].i ;
   xcx[12].r = f1 ; xcx[12].i = f2 ;

   f1 = xcx[5].r ; f2 = xcx[5].i ;
   xcx[5].r = xcx[10].r ; xcx[5].i = xcx[10].i ;
   xcx[10].r = f1 ; xcx[10].i = f2 ;

   f1 = xcx[7].r ; f2 = xcx[7].i ;
   xcx[7].r = xcx[14].r ; xcx[7].i = xcx[14].i ;
   xcx[14].r = f1 ; xcx[14].i = f2 ;

   f1 = xcx[11].r ; f2 = xcx[11].i ;
   xcx[11].r = xcx[13].r ; xcx[11].i = xcx[13].i ;
   xcx[13].r = f1 ; xcx[13].i = f2 ;

   /** butterflying part **/

   f1 = xcx[1].r ; f3 = xcx[1].i ;  /* cos=1 sin=0 */
   f2 = xcx[0].r ; f4 = xcx[0].i ;
   xcx[1].r = f2-f1 ; xcx[1].i = f4-f3 ;
   xcx[0].r = f2+f1 ; xcx[0].i = f4+f3 ;

   f1 = xcx[3].r ; f3 = xcx[3].i ;  /* cos=1 sin=0 */
   f2 = xcx[2].r ; f4 = xcx[2].i ;
   xcx[3].r = f2-f1 ; xcx[3].i = f4-f3 ;
   xcx[2].r = f2+f1 ; xcx[2].i = f4+f3 ;

   f1 = xcx[5].r ; f3 = xcx[5].i ;  /* cos=1 sin=0 */
   f2 = xcx[4].r ; f4 = xcx[4].i ;
   xcx[5].r = f2-f1 ; xcx[5].i = f4-f3 ;
   xcx[4].r = f2+f1 ; xcx[4].i = f4+f3 ;

   f1 = xcx[7].r ; f3 = xcx[7].i ;  /* cos=1 sin=0 */
   f2 = xcx[6].r ; f4 = xcx[6].i ;
   xcx[7].r = f2-f1 ; xcx[7].i = f4-f3 ;
   xcx[6].r = f2+f1 ; xcx[6].i = f4+f3 ;

   f1 = xcx[9].r ; f3 = xcx[9].i ;  /* cos=1 sin=0 */
   f2 = xcx[8].r ; f4 = xcx[8].i ;
   xcx[9].r = f2-f1 ; xcx[9].i = f4-f3 ;
   xcx[8].r = f2+f1 ; xcx[8].i = f4+f3 ;

   f1 = xcx[11].r ; f3 = xcx[11].i ;  /* cos=1 sin=0 */
   f2 = xcx[10].r ; f4 = xcx[10].i ;
   xcx[11].r = f2-f1 ; xcx[11].i = f4-f3 ;
   xcx[10].r = f2+f1 ; xcx[10].i = f4+f3 ;

   f1 = xcx[13].r ; f3 = xcx[13].i ;  /* cos=1 sin=0 */
   f2 = xcx[12].r ; f4 = xcx[12].i ;
   xcx[13].r = f2-f1 ; xcx[13].i = f4-f3 ;
   xcx[12].r = f2+f1 ; xcx[12].i = f4+f3 ;

   f1 = xcx[15].r ; f3 = xcx[15].i ;  /* cos=1 sin=0 */
   f2 = xcx[14].r ; f4 = xcx[14].i ;
   xcx[15].r = f2-f1 ; xcx[15].i = f4-f3 ;
   xcx[14].r = f2+f1 ; xcx[14].i = f4+f3 ;

   f1 = xcx[2].r ; f3 = xcx[2].i ;  /* cos=1 sin=0 */
   f2 = xcx[0].r ; f4 = xcx[0].i ;
   xcx[2].r = f2-f1 ; xcx[2].i = f4-f3 ;
   xcx[0].r = f2+f1 ; xcx[0].i = f4+f3 ;

   f1 = xcx[6].r ; f3 = xcx[6].i ;  /* cos=1 sin=0 */
   f2 = xcx[4].r ; f4 = xcx[4].i ;
   xcx[6].r = f2-f1 ; xcx[6].i = f4-f3 ;
   xcx[4].r = f2+f1 ; xcx[4].i = f4+f3 ;

   f1 = xcx[10].r ; f3 = xcx[10].i ;  /* cos=1 sin=0 */
   f2 = xcx[8].r ; f4 = xcx[8].i ;
   xcx[10].r = f2-f1 ; xcx[10].i = f4-f3 ;
   xcx[8].r = f2+f1 ; xcx[8].i = f4+f3 ;

   f1 = xcx[14].r ; f3 = xcx[14].i ;  /* cos=1 sin=0 */
   f2 = xcx[12].r ; f4 = xcx[12].i ;
   xcx[14].r = f2-f1 ; xcx[14].i = f4-f3 ;
   xcx[12].r = f2+f1 ; xcx[12].i = f4+f3 ;

   f1 = xcx[3].r * csp[2].r - xcx[3].i * csp[2].i ; /* twiddles */
   f3 = xcx[3].r * csp[2].i + xcx[3].i * csp[2].r ;
   f2 = xcx[1].r ; f4 = xcx[1].i ;
   xcx[3].r = f2-f1 ; xcx[3].i = f4-f3 ;
   xcx[1].r = f2+f1 ; xcx[1].i = f4+f3 ;

   f1 = xcx[7].r * csp[2].r - xcx[7].i * csp[2].i ; /* twiddles */
   f3 = xcx[7].r * csp[2].i + xcx[7].i * csp[2].r ;
   f2 = xcx[5].r ; f4 = xcx[5].i ;
   xcx[7].r = f2-f1 ; xcx[7].i = f4-f3 ;
   xcx[5].r = f2+f1 ; xcx[5].i = f4+f3 ;

   f1 = xcx[11].r * csp[2].r - xcx[11].i * csp[2].i ; /* twiddles */
   f3 = xcx[11].r * csp[2].i + xcx[11].i * csp[2].r ;
   f2 = xcx[9].r ; f4 = xcx[9].i ;
   xcx[11].r = f2-f1 ; xcx[11].i = f4-f3 ;
   xcx[9].r = f2+f1 ; xcx[9].i = f4+f3 ;

   f1 = xcx[15].r * csp[2].r - xcx[15].i * csp[2].i ; /* twiddles */
   f3 = xcx[15].r * csp[2].i + xcx[15].i * csp[2].r ;
   f2 = xcx[13].r ; f4 = xcx[13].i ;
   xcx[15].r = f2-f1 ; xcx[15].i = f4-f3 ;
   xcx[13].r = f2+f1 ; xcx[13].i = f4+f3 ;

   f1 = xcx[4].r ; f3 = xcx[4].i ;  /* cos=1 sin=0 */
   f2 = xcx[0].r ; f4 = xcx[0].i ;
   xcx[4].r = f2-f1 ; xcx[4].i = f4-f3 ;
   xcx[0].r = f2+f1 ; xcx[0].i = f4+f3 ;

   f1 = xcx[12].r ; f3 = xcx[12].i ;  /* cos=1 sin=0 */
   f2 = xcx[8].r ; f4 = xcx[8].i ;
   xcx[12].r = f2-f1 ; xcx[12].i = f4-f3 ;
   xcx[8].r = f2+f1 ; xcx[8].i = f4+f3 ;

   f1 = xcx[5].r * csp[4].r - xcx[5].i * csp[4].i ; /* twiddles */
   f3 = xcx[5].r * csp[4].i + xcx[5].i * csp[4].r ;
   f2 = xcx[1].r ; f4 = xcx[1].i ;
   xcx[5].r = f2-f1 ; xcx[5].i = f4-f3 ;
   xcx[1].r = f2+f1 ; xcx[1].i = f4+f3 ;

   f1 = xcx[13].r * csp[4].r - xcx[13].i * csp[4].i ; /* twiddles */
   f3 = xcx[13].r * csp[4].i + xcx[13].i * csp[4].r ;
   f2 = xcx[9].r ; f4 = xcx[9].i ;
   xcx[13].r = f2-f1 ; xcx[13].i = f4-f3 ;
   xcx[9].r = f2+f1 ; xcx[9].i = f4+f3 ;

   f1 = xcx[6].r * csp[5].r - xcx[6].i * csp[5].i ; /* twiddles */
   f3 = xcx[6].r * csp[5].i + xcx[6].i * csp[5].r ;
   f2 = xcx[2].r ; f4 = xcx[2].i ;
   xcx[6].r = f2-f1 ; xcx[6].i = f4-f3 ;
   xcx[2].r = f2+f1 ; xcx[2].i = f4+f3 ;

   f1 = xcx[14].r * csp[5].r - xcx[14].i * csp[5].i ; /* twiddles */
   f3 = xcx[14].r * csp[5].i + xcx[14].i * csp[5].r ;
   f2 = xcx[10].r ; f4 = xcx[10].i ;
   xcx[14].r = f2-f1 ; xcx[14].i = f4-f3 ;
   xcx[10].r = f2+f1 ; xcx[10].i = f4+f3 ;

   f1 = xcx[7].r * csp[6].r - xcx[7].i * csp[6].i ; /* twiddles */
   f3 = xcx[7].r * csp[6].i + xcx[7].i * csp[6].r ;
   f2 = xcx[3].r ; f4 = xcx[3].i ;
   xcx[7].r = f2-f1 ; xcx[7].i = f4-f3 ;
   xcx[3].r = f2+f1 ; xcx[3].i = f4+f3 ;

   f1 = xcx[15].r * csp[6].r - xcx[15].i * csp[6].i ; /* twiddles */
   f3 = xcx[15].r * csp[6].i + xcx[15].i * csp[6].r ;
   f2 = xcx[11].r ; f4 = xcx[11].i ;
   xcx[15].r = f2-f1 ; xcx[15].i = f4-f3 ;
   xcx[11].r = f2+f1 ; xcx[11].i = f4+f3 ;

   f1 = xcx[8].r ; f3 = xcx[8].i ;  /* cos=1 sin=0 */
   f2 = xcx[0].r ; f4 = xcx[0].i ;
   xcx[8].r = f2-f1 ; xcx[8].i = f4-f3 ;
   xcx[0].r = f2+f1 ; xcx[0].i = f4+f3 ;

   f1 = xcx[9].r * csp[8].r - xcx[9].i * csp[8].i ; /* twiddles */
   f3 = xcx[9].r * csp[8].i + xcx[9].i * csp[8].r ;
   f2 = xcx[1].r ; f4 = xcx[1].i ;
   xcx[9].r = f2-f1 ; xcx[9].i = f4-f3 ;
   xcx[1].r = f2+f1 ; xcx[1].i = f4+f3 ;

   f1 = xcx[10].r * csp[9].r - xcx[10].i * csp[9].i ; /* twiddles */
   f3 = xcx[10].r * csp[9].i + xcx[10].i * csp[9].r ;
   f2 = xcx[2].r ; f4 = xcx[2].i ;
   xcx[10].r = f2-f1 ; xcx[10].i = f4-f3 ;
   xcx[2].r = f2+f1 ; xcx[2].i = f4+f3 ;

   f1 = xcx[11].r * csp[10].r - xcx[11].i * csp[10].i ; /* twiddles */
   f3 = xcx[11].r * csp[10].i + xcx[11].i * csp[10].r ;
   f2 = xcx[3].r ; f4 = xcx[3].i ;
   xcx[11].r = f2-f1 ; xcx[11].i = f4-f3 ;
   xcx[3].r = f2+f1 ; xcx[3].i = f4+f3 ;

   f1 = xcx[12].r * csp[11].r - xcx[12].i * csp[11].i ; /* twiddles */
   f3 = xcx[12].r * csp[11].i + xcx[12].i * csp[11].r ;
   f2 = xcx[4].r ; f4 = xcx[4].i ;
   xcx[12].r = f2-f1 ; xcx[12].i = f4-f3 ;
   xcx[4].r = f2+f1 ; xcx[4].i = f4+f3 ;

   f1 = xcx[13].r * csp[12].r - xcx[13].i * csp[12].i ; /* twiddles */
   f3 = xcx[13].r * csp[12].i + xcx[13].i * csp[12].r ;
   f2 = xcx[5].r ; f4 = xcx[5].i ;
   xcx[13].r = f2-f1 ; xcx[13].i = f4-f3 ;
   xcx[5].r = f2+f1 ; xcx[5].i = f4+f3 ;

   f1 = xcx[14].r * csp[13].r - xcx[14].i * csp[13].i ; /* twiddles */
   f3 = xcx[14].r * csp[13].i + xcx[14].i * csp[13].r ;
   f2 = xcx[6].r ; f4 = xcx[6].i ;
   xcx[14].r = f2-f1 ; xcx[14].i = f4-f3 ;
   xcx[6].r = f2+f1 ; xcx[6].i = f4+f3 ;

   f1 = xcx[15].r * csp[14].r - xcx[15].i * csp[14].i ; /* twiddles */
   f3 = xcx[15].r * csp[14].i + xcx[15].i * csp[14].r ;
   f2 = xcx[7].r ; f4 = xcx[7].i ;
   xcx[15].r = f2-f1 ; xcx[15].i = f4-f3 ;
   xcx[7].r = f2+f1 ; xcx[7].i = f4+f3 ;

   return ;
}

/**************************************/
/* FFT routine unrolled of length  32 */

static void fft32( int mode , complex * xc )
{
   register complex * csp , * xcx=xc;
   register float f1,f2,f3,f4 ;

   /** perhaps initialize **/

   if( nold != 32 ) csfft_trigconsts( 32 ) ;

   csp = (mode > 0) ? csplus : csminus ;  /* choose const array */

   /** data swapping part **/

   f1 = xcx[1].r ; f2 = xcx[1].i ;
   xcx[1].r = xcx[16].r ; xcx[1].i = xcx[16].i ;
   xcx[16].r = f1 ; xcx[16].i = f2 ;

   f1 = xcx[2].r ; f2 = xcx[2].i ;
   xcx[2].r = xcx[8].r ; xcx[2].i = xcx[8].i ;
   xcx[8].r = f1 ; xcx[8].i = f2 ;

   f1 = xcx[3].r ; f2 = xcx[3].i ;
   xcx[3].r = xcx[24].r ; xcx[3].i = xcx[24].i ;
   xcx[24].r = f1 ; xcx[24].i = f2 ;

   f1 = xcx[5].r ; f2 = xcx[5].i ;
   xcx[5].r = xcx[20].r ; xcx[5].i = xcx[20].i ;
   xcx[20].r = f1 ; xcx[20].i = f2 ;

   f1 = xcx[6].r ; f2 = xcx[6].i ;
   xcx[6].r = xcx[12].r ; xcx[6].i = xcx[12].i ;
   xcx[12].r = f1 ; xcx[12].i = f2 ;

   f1 = xcx[7].r ; f2 = xcx[7].i ;
   xcx[7].r = xcx[28].r ; xcx[7].i = xcx[28].i ;
   xcx[28].r = f1 ; xcx[28].i = f2 ;

   f1 = xcx[9].r ; f2 = xcx[9].i ;
   xcx[9].r = xcx[18].r ; xcx[9].i = xcx[18].i ;
   xcx[18].r = f1 ; xcx[18].i = f2 ;

   f1 = xcx[11].r ; f2 = xcx[11].i ;
   xcx[11].r = xcx[26].r ; xcx[11].i = xcx[26].i ;
   xcx[26].r = f1 ; xcx[26].i = f2 ;

   f1 = xcx[13].r ; f2 = xcx[13].i ;
   xcx[13].r = xcx[22].r ; xcx[13].i = xcx[22].i ;
   xcx[22].r = f1 ; xcx[22].i = f2 ;

   f1 = xcx[15].r ; f2 = xcx[15].i ;
   xcx[15].r = xcx[30].r ; xcx[15].i = xcx[30].i ;
   xcx[30].r = f1 ; xcx[30].i = f2 ;

   f1 = xcx[19].r ; f2 = xcx[19].i ;
   xcx[19].r = xcx[25].r ; xcx[19].i = xcx[25].i ;
   xcx[25].r = f1 ; xcx[25].i = f2 ;

   f1 = xcx[23].r ; f2 = xcx[23].i ;
   xcx[23].r = xcx[29].r ; xcx[23].i = xcx[29].i ;
   xcx[29].r = f1 ; xcx[29].i = f2 ;

   /** butterflying part **/

   f1 = xcx[1].r ; f3 = xcx[1].i ;  /* cos=1 sin=0 */
   f2 = xcx[0].r ; f4 = xcx[0].i ;
   xcx[1].r = f2-f1 ; xcx[1].i = f4-f3 ;
   xcx[0].r = f2+f1 ; xcx[0].i = f4+f3 ;

   f1 = xcx[3].r ; f3 = xcx[3].i ;  /* cos=1 sin=0 */
   f2 = xcx[2].r ; f4 = xcx[2].i ;
   xcx[3].r = f2-f1 ; xcx[3].i = f4-f3 ;
   xcx[2].r = f2+f1 ; xcx[2].i = f4+f3 ;

   f1 = xcx[5].r ; f3 = xcx[5].i ;  /* cos=1 sin=0 */
   f2 = xcx[4].r ; f4 = xcx[4].i ;
   xcx[5].r = f2-f1 ; xcx[5].i = f4-f3 ;
   xcx[4].r = f2+f1 ; xcx[4].i = f4+f3 ;

   f1 = xcx[7].r ; f3 = xcx[7].i ;  /* cos=1 sin=0 */
   f2 = xcx[6].r ; f4 = xcx[6].i ;
   xcx[7].r = f2-f1 ; xcx[7].i = f4-f3 ;
   xcx[6].r = f2+f1 ; xcx[6].i = f4+f3 ;

   f1 = xcx[9].r ; f3 = xcx[9].i ;  /* cos=1 sin=0 */
   f2 = xcx[8].r ; f4 = xcx[8].i ;
   xcx[9].r = f2-f1 ; xcx[9].i = f4-f3 ;
   xcx[8].r = f2+f1 ; xcx[8].i = f4+f3 ;

   f1 = xcx[11].r ; f3 = xcx[11].i ;  /* cos=1 sin=0 */
   f2 = xcx[10].r ; f4 = xcx[10].i ;
   xcx[11].r = f2-f1 ; xcx[11].i = f4-f3 ;
   xcx[10].r = f2+f1 ; xcx[10].i = f4+f3 ;

   f1 = xcx[13].r ; f3 = xcx[13].i ;  /* cos=1 sin=0 */
   f2 = xcx[12].r ; f4 = xcx[12].i ;
   xcx[13].r = f2-f1 ; xcx[13].i = f4-f3 ;
   xcx[12].r = f2+f1 ; xcx[12].i = f4+f3 ;

   f1 = xcx[15].r ; f3 = xcx[15].i ;  /* cos=1 sin=0 */
   f2 = xcx[14].r ; f4 = xcx[14].i ;
   xcx[15].r = f2-f1 ; xcx[15].i = f4-f3 ;
   xcx[14].r = f2+f1 ; xcx[14].i = f4+f3 ;

   f1 = xcx[17].r ; f3 = xcx[17].i ;  /* cos=1 sin=0 */
   f2 = xcx[16].r ; f4 = xcx[16].i ;
   xcx[17].r = f2-f1 ; xcx[17].i = f4-f3 ;
   xcx[16].r = f2+f1 ; xcx[16].i = f4+f3 ;

   f1 = xcx[19].r ; f3 = xcx[19].i ;  /* cos=1 sin=0 */
   f2 = xcx[18].r ; f4 = xcx[18].i ;
   xcx[19].r = f2-f1 ; xcx[19].i = f4-f3 ;
   xcx[18].r = f2+f1 ; xcx[18].i = f4+f3 ;

   f1 = xcx[21].r ; f3 = xcx[21].i ;  /* cos=1 sin=0 */
   f2 = xcx[20].r ; f4 = xcx[20].i ;
   xcx[21].r = f2-f1 ; xcx[21].i = f4-f3 ;
   xcx[20].r = f2+f1 ; xcx[20].i = f4+f3 ;

   f1 = xcx[23].r ; f3 = xcx[23].i ;  /* cos=1 sin=0 */
   f2 = xcx[22].r ; f4 = xcx[22].i ;
   xcx[23].r = f2-f1 ; xcx[23].i = f4-f3 ;
   xcx[22].r = f2+f1 ; xcx[22].i = f4+f3 ;

   f1 = xcx[25].r ; f3 = xcx[25].i ;  /* cos=1 sin=0 */
   f2 = xcx[24].r ; f4 = xcx[24].i ;
   xcx[25].r = f2-f1 ; xcx[25].i = f4-f3 ;
   xcx[24].r = f2+f1 ; xcx[24].i = f4+f3 ;

   f1 = xcx[27].r ; f3 = xcx[27].i ;  /* cos=1 sin=0 */
   f2 = xcx[26].r ; f4 = xcx[26].i ;
   xcx[27].r = f2-f1 ; xcx[27].i = f4-f3 ;
   xcx[26].r = f2+f1 ; xcx[26].i = f4+f3 ;

   f1 = xcx[29].r ; f3 = xcx[29].i ;  /* cos=1 sin=0 */
   f2 = xcx[28].r ; f4 = xcx[28].i ;
   xcx[29].r = f2-f1 ; xcx[29].i = f4-f3 ;
   xcx[28].r = f2+f1 ; xcx[28].i = f4+f3 ;

   f1 = xcx[31].r ; f3 = xcx[31].i ;  /* cos=1 sin=0 */
   f2 = xcx[30].r ; f4 = xcx[30].i ;
   xcx[31].r = f2-f1 ; xcx[31].i = f4-f3 ;
   xcx[30].r = f2+f1 ; xcx[30].i = f4+f3 ;

   f1 = xcx[2].r ; f3 = xcx[2].i ;  /* cos=1 sin=0 */
   f2 = xcx[0].r ; f4 = xcx[0].i ;
   xcx[2].r = f2-f1 ; xcx[2].i = f4-f3 ;
   xcx[0].r = f2+f1 ; xcx[0].i = f4+f3 ;

   f1 = xcx[6].r ; f3 = xcx[6].i ;  /* cos=1 sin=0 */
   f2 = xcx[4].r ; f4 = xcx[4].i ;
   xcx[6].r = f2-f1 ; xcx[6].i = f4-f3 ;
   xcx[4].r = f2+f1 ; xcx[4].i = f4+f3 ;

   f1 = xcx[10].r ; f3 = xcx[10].i ;  /* cos=1 sin=0 */
   f2 = xcx[8].r ; f4 = xcx[8].i ;
   xcx[10].r = f2-f1 ; xcx[10].i = f4-f3 ;
   xcx[8].r = f2+f1 ; xcx[8].i = f4+f3 ;

   f1 = xcx[14].r ; f3 = xcx[14].i ;  /* cos=1 sin=0 */
   f2 = xcx[12].r ; f4 = xcx[12].i ;
   xcx[14].r = f2-f1 ; xcx[14].i = f4-f3 ;
   xcx[12].r = f2+f1 ; xcx[12].i = f4+f3 ;

   f1 = xcx[18].r ; f3 = xcx[18].i ;  /* cos=1 sin=0 */
   f2 = xcx[16].r ; f4 = xcx[16].i ;
   xcx[18].r = f2-f1 ; xcx[18].i = f4-f3 ;
   xcx[16].r = f2+f1 ; xcx[16].i = f4+f3 ;

   f1 = xcx[22].r ; f3 = xcx[22].i ;  /* cos=1 sin=0 */
   f2 = xcx[20].r ; f4 = xcx[20].i ;
   xcx[22].r = f2-f1 ; xcx[22].i = f4-f3 ;
   xcx[20].r = f2+f1 ; xcx[20].i = f4+f3 ;

   f1 = xcx[26].r ; f3 = xcx[26].i ;  /* cos=1 sin=0 */
   f2 = xcx[24].r ; f4 = xcx[24].i ;
   xcx[26].r = f2-f1 ; xcx[26].i = f4-f3 ;
   xcx[24].r = f2+f1 ; xcx[24].i = f4+f3 ;

   f1 = xcx[30].r ; f3 = xcx[30].i ;  /* cos=1 sin=0 */
   f2 = xcx[28].r ; f4 = xcx[28].i ;
   xcx[30].r = f2-f1 ; xcx[30].i = f4-f3 ;
   xcx[28].r = f2+f1 ; xcx[28].i = f4+f3 ;

   f1 = - xcx[3].i * csp[2].i ; /* cos=0 twiddles */
   f3 = xcx[3].r * csp[2].i ;
   f2 = xcx[1].r ; f4 = xcx[1].i ;
   xcx[3].r = f2-f1 ; xcx[3].i = f4-f3 ;
   xcx[1].r = f2+f1 ; xcx[1].i = f4+f3 ;

   f1 = - xcx[7].i * csp[2].i ; /* cos=0 twiddles */
   f3 = xcx[7].r * csp[2].i ;
   f2 = xcx[5].r ; f4 = xcx[5].i ;
   xcx[7].r = f2-f1 ; xcx[7].i = f4-f3 ;
   xcx[5].r = f2+f1 ; xcx[5].i = f4+f3 ;

   f1 = - xcx[11].i * csp[2].i ; /* cos=0 twiddles */
   f3 = xcx[11].r * csp[2].i ;
   f2 = xcx[9].r ; f4 = xcx[9].i ;
   xcx[11].r = f2-f1 ; xcx[11].i = f4-f3 ;
   xcx[9].r = f2+f1 ; xcx[9].i = f4+f3 ;

   f1 = - xcx[15].i * csp[2].i ; /* cos=0 twiddles */
   f3 = xcx[15].r * csp[2].i ;
   f2 = xcx[13].r ; f4 = xcx[13].i ;
   xcx[15].r = f2-f1 ; xcx[15].i = f4-f3 ;
   xcx[13].r = f2+f1 ; xcx[13].i = f4+f3 ;

   f1 = - xcx[19].i * csp[2].i ; /* cos=0 twiddles */
   f3 = xcx[19].r * csp[2].i ;
   f2 = xcx[17].r ; f4 = xcx[17].i ;
   xcx[19].r = f2-f1 ; xcx[19].i = f4-f3 ;
   xcx[17].r = f2+f1 ; xcx[17].i = f4+f3 ;

   f1 = - xcx[23].i * csp[2].i ; /* cos=0 twiddles */
   f3 = xcx[23].r * csp[2].i ;
   f2 = xcx[21].r ; f4 = xcx[21].i ;
   xcx[23].r = f2-f1 ; xcx[23].i = f4-f3 ;
   xcx[21].r = f2+f1 ; xcx[21].i = f4+f3 ;

   f1 = - xcx[27].i * csp[2].i ; /* cos=0 twiddles */
   f3 = xcx[27].r * csp[2].i ;
   f2 = xcx[25].r ; f4 = xcx[25].i ;
   xcx[27].r = f2-f1 ; xcx[27].i = f4-f3 ;
   xcx[25].r = f2+f1 ; xcx[25].i = f4+f3 ;

   f1 = - xcx[31].i * csp[2].i ; /* cos=0 twiddles */
   f3 = xcx[31].r * csp[2].i ;
   f2 = xcx[29].r ; f4 = xcx[29].i ;
   xcx[31].r = f2-f1 ; xcx[31].i = f4-f3 ;
   xcx[29].r = f2+f1 ; xcx[29].i = f4+f3 ;

   f1 = xcx[4].r ; f3 = xcx[4].i ;  /* cos=1 sin=0 */
   f2 = xcx[0].r ; f4 = xcx[0].i ;
   xcx[4].r = f2-f1 ; xcx[4].i = f4-f3 ;
   xcx[0].r = f2+f1 ; xcx[0].i = f4+f3 ;

   f1 = xcx[12].r ; f3 = xcx[12].i ;  /* cos=1 sin=0 */
   f2 = xcx[8].r ; f4 = xcx[8].i ;
   xcx[12].r = f2-f1 ; xcx[12].i = f4-f3 ;
   xcx[8].r = f2+f1 ; xcx[8].i = f4+f3 ;

   f1 = xcx[20].r ; f3 = xcx[20].i ;  /* cos=1 sin=0 */
   f2 = xcx[16].r ; f4 = xcx[16].i ;
   xcx[20].r = f2-f1 ; xcx[20].i = f4-f3 ;
   xcx[16].r = f2+f1 ; xcx[16].i = f4+f3 ;

   f1 = xcx[28].r ; f3 = xcx[28].i ;  /* cos=1 sin=0 */
   f2 = xcx[24].r ; f4 = xcx[24].i ;
   xcx[28].r = f2-f1 ; xcx[28].i = f4-f3 ;
   xcx[24].r = f2+f1 ; xcx[24].i = f4+f3 ;

   f1 = xcx[5].r * csp[4].r - xcx[5].i * csp[4].i ; /* twiddles */
   f3 = xcx[5].r * csp[4].i + xcx[5].i * csp[4].r ;
   f2 = xcx[1].r ; f4 = xcx[1].i ;
   xcx[5].r = f2-f1 ; xcx[5].i = f4-f3 ;
   xcx[1].r = f2+f1 ; xcx[1].i = f4+f3 ;

   f1 = xcx[13].r * csp[4].r - xcx[13].i * csp[4].i ; /* twiddles */
   f3 = xcx[13].r * csp[4].i + xcx[13].i * csp[4].r ;
   f2 = xcx[9].r ; f4 = xcx[9].i ;
   xcx[13].r = f2-f1 ; xcx[13].i = f4-f3 ;
   xcx[9].r = f2+f1 ; xcx[9].i = f4+f3 ;

   f1 = xcx[21].r * csp[4].r - xcx[21].i * csp[4].i ; /* twiddles */
   f3 = xcx[21].r * csp[4].i + xcx[21].i * csp[4].r ;
   f2 = xcx[17].r ; f4 = xcx[17].i ;
   xcx[21].r = f2-f1 ; xcx[21].i = f4-f3 ;
   xcx[17].r = f2+f1 ; xcx[17].i = f4+f3 ;

   f1 = xcx[29].r * csp[4].r - xcx[29].i * csp[4].i ; /* twiddles */
   f3 = xcx[29].r * csp[4].i + xcx[29].i * csp[4].r ;
   f2 = xcx[25].r ; f4 = xcx[25].i ;
   xcx[29].r = f2-f1 ; xcx[29].i = f4-f3 ;
   xcx[25].r = f2+f1 ; xcx[25].i = f4+f3 ;

   f1 = - xcx[6].i * csp[5].i ; /* cos=0 twiddles */
   f3 = xcx[6].r * csp[5].i ;
   f2 = xcx[2].r ; f4 = xcx[2].i ;
   xcx[6].r = f2-f1 ; xcx[6].i = f4-f3 ;
   xcx[2].r = f2+f1 ; xcx[2].i = f4+f3 ;

   f1 = - xcx[14].i * csp[5].i ; /* cos=0 twiddles */
   f3 = xcx[14].r * csp[5].i ;
   f2 = xcx[10].r ; f4 = xcx[10].i ;
   xcx[14].r = f2-f1 ; xcx[14].i = f4-f3 ;
   xcx[10].r = f2+f1 ; xcx[10].i = f4+f3 ;

   f1 = - xcx[22].i * csp[5].i ; /* cos=0 twiddles */
   f3 = xcx[22].r * csp[5].i ;
   f2 = xcx[18].r ; f4 = xcx[18].i ;
   xcx[22].r = f2-f1 ; xcx[22].i = f4-f3 ;
   xcx[18].r = f2+f1 ; xcx[18].i = f4+f3 ;

   f1 = - xcx[30].i * csp[5].i ; /* cos=0 twiddles */
   f3 = xcx[30].r * csp[5].i ;
   f2 = xcx[26].r ; f4 = xcx[26].i ;
   xcx[30].r = f2-f1 ; xcx[30].i = f4-f3 ;
   xcx[26].r = f2+f1 ; xcx[26].i = f4+f3 ;

   f1 = xcx[7].r * csp[6].r - xcx[7].i * csp[6].i ; /* twiddles */
   f3 = xcx[7].r * csp[6].i + xcx[7].i * csp[6].r ;
   f2 = xcx[3].r ; f4 = xcx[3].i ;
   xcx[7].r = f2-f1 ; xcx[7].i = f4-f3 ;
   xcx[3].r = f2+f1 ; xcx[3].i = f4+f3 ;

   f1 = xcx[15].r * csp[6].r - xcx[15].i * csp[6].i ; /* twiddles */
   f3 = xcx[15].r * csp[6].i + xcx[15].i * csp[6].r ;
   f2 = xcx[11].r ; f4 = xcx[11].i ;
   xcx[15].r = f2-f1 ; xcx[15].i = f4-f3 ;
   xcx[11].r = f2+f1 ; xcx[11].i = f4+f3 ;

   f1 = xcx[23].r * csp[6].r - xcx[23].i * csp[6].i ; /* twiddles */
   f3 = xcx[23].r * csp[6].i + xcx[23].i * csp[6].r ;
   f2 = xcx[19].r ; f4 = xcx[19].i ;
   xcx[23].r = f2-f1 ; xcx[23].i = f4-f3 ;
   xcx[19].r = f2+f1 ; xcx[19].i = f4+f3 ;

   f1 = xcx[31].r * csp[6].r - xcx[31].i * csp[6].i ; /* twiddles */
   f3 = xcx[31].r * csp[6].i + xcx[31].i * csp[6].r ;
   f2 = xcx[27].r ; f4 = xcx[27].i ;
   xcx[31].r = f2-f1 ; xcx[31].i = f4-f3 ;
   xcx[27].r = f2+f1 ; xcx[27].i = f4+f3 ;

   f1 = xcx[8].r ; f3 = xcx[8].i ;  /* cos=1 sin=0 */
   f2 = xcx[0].r ; f4 = xcx[0].i ;
   xcx[8].r = f2-f1 ; xcx[8].i = f4-f3 ;
   xcx[0].r = f2+f1 ; xcx[0].i = f4+f3 ;

   f1 = xcx[24].r ; f3 = xcx[24].i ;  /* cos=1 sin=0 */
   f2 = xcx[16].r ; f4 = xcx[16].i ;
   xcx[24].r = f2-f1 ; xcx[24].i = f4-f3 ;
   xcx[16].r = f2+f1 ; xcx[16].i = f4+f3 ;

   f1 = xcx[9].r * csp[8].r - xcx[9].i * csp[8].i ; /* twiddles */
   f3 = xcx[9].r * csp[8].i + xcx[9].i * csp[8].r ;
   f2 = xcx[1].r ; f4 = xcx[1].i ;
   xcx[9].r = f2-f1 ; xcx[9].i = f4-f3 ;
   xcx[1].r = f2+f1 ; xcx[1].i = f4+f3 ;

   f1 = xcx[25].r * csp[8].r - xcx[25].i * csp[8].i ; /* twiddles */
   f3 = xcx[25].r * csp[8].i + xcx[25].i * csp[8].r ;
   f2 = xcx[17].r ; f4 = xcx[17].i ;
   xcx[25].r = f2-f1 ; xcx[25].i = f4-f3 ;
   xcx[17].r = f2+f1 ; xcx[17].i = f4+f3 ;

   f1 = xcx[10].r * csp[9].r - xcx[10].i * csp[9].i ; /* twiddles */
   f3 = xcx[10].r * csp[9].i + xcx[10].i * csp[9].r ;
   f2 = xcx[2].r ; f4 = xcx[2].i ;
   xcx[10].r = f2-f1 ; xcx[10].i = f4-f3 ;
   xcx[2].r = f2+f1 ; xcx[2].i = f4+f3 ;

   f1 = xcx[26].r * csp[9].r - xcx[26].i * csp[9].i ; /* twiddles */
   f3 = xcx[26].r * csp[9].i + xcx[26].i * csp[9].r ;
   f2 = xcx[18].r ; f4 = xcx[18].i ;
   xcx[26].r = f2-f1 ; xcx[26].i = f4-f3 ;
   xcx[18].r = f2+f1 ; xcx[18].i = f4+f3 ;

   f1 = xcx[11].r * csp[10].r - xcx[11].i * csp[10].i ; /* twiddles */
   f3 = xcx[11].r * csp[10].i + xcx[11].i * csp[10].r ;
   f2 = xcx[3].r ; f4 = xcx[3].i ;
   xcx[11].r = f2-f1 ; xcx[11].i = f4-f3 ;
   xcx[3].r = f2+f1 ; xcx[3].i = f4+f3 ;

   f1 = xcx[27].r * csp[10].r - xcx[27].i * csp[10].i ; /* twiddles */
   f3 = xcx[27].r * csp[10].i + xcx[27].i * csp[10].r ;
   f2 = xcx[19].r ; f4 = xcx[19].i ;
   xcx[27].r = f2-f1 ; xcx[27].i = f4-f3 ;
   xcx[19].r = f2+f1 ; xcx[19].i = f4+f3 ;

   f1 = - xcx[12].i * csp[11].i ; /* cos=0 twiddles */
   f3 = xcx[12].r * csp[11].i ;
   f2 = xcx[4].r ; f4 = xcx[4].i ;
   xcx[12].r = f2-f1 ; xcx[12].i = f4-f3 ;
   xcx[4].r = f2+f1 ; xcx[4].i = f4+f3 ;

   f1 = - xcx[28].i * csp[11].i ; /* cos=0 twiddles */
   f3 = xcx[28].r * csp[11].i ;
   f2 = xcx[20].r ; f4 = xcx[20].i ;
   xcx[28].r = f2-f1 ; xcx[28].i = f4-f3 ;
   xcx[20].r = f2+f1 ; xcx[20].i = f4+f3 ;

   f1 = xcx[13].r * csp[12].r - xcx[13].i * csp[12].i ; /* twiddles */
   f3 = xcx[13].r * csp[12].i + xcx[13].i * csp[12].r ;
   f2 = xcx[5].r ; f4 = xcx[5].i ;
   xcx[13].r = f2-f1 ; xcx[13].i = f4-f3 ;
   xcx[5].r = f2+f1 ; xcx[5].i = f4+f3 ;

   f1 = xcx[29].r * csp[12].r - xcx[29].i * csp[12].i ; /* twiddles */
   f3 = xcx[29].r * csp[12].i + xcx[29].i * csp[12].r ;
   f2 = xcx[21].r ; f4 = xcx[21].i ;
   xcx[29].r = f2-f1 ; xcx[29].i = f4-f3 ;
   xcx[21].r = f2+f1 ; xcx[21].i = f4+f3 ;

   f1 = xcx[14].r * csp[13].r - xcx[14].i * csp[13].i ; /* twiddles */
   f3 = xcx[14].r * csp[13].i + xcx[14].i * csp[13].r ;
   f2 = xcx[6].r ; f4 = xcx[6].i ;
   xcx[14].r = f2-f1 ; xcx[14].i = f4-f3 ;
   xcx[6].r = f2+f1 ; xcx[6].i = f4+f3 ;

   f1 = xcx[30].r * csp[13].r - xcx[30].i * csp[13].i ; /* twiddles */
   f3 = xcx[30].r * csp[13].i + xcx[30].i * csp[13].r ;
   f2 = xcx[22].r ; f4 = xcx[22].i ;
   xcx[30].r = f2-f1 ; xcx[30].i = f4-f3 ;
   xcx[22].r = f2+f1 ; xcx[22].i = f4+f3 ;

   f1 = xcx[15].r * csp[14].r - xcx[15].i * csp[14].i ; /* twiddles */
   f3 = xcx[15].r * csp[14].i + xcx[15].i * csp[14].r ;
   f2 = xcx[7].r ; f4 = xcx[7].i ;
   xcx[15].r = f2-f1 ; xcx[15].i = f4-f3 ;
   xcx[7].r = f2+f1 ; xcx[7].i = f4+f3 ;

   f1 = xcx[31].r * csp[14].r - xcx[31].i * csp[14].i ; /* twiddles */
   f3 = xcx[31].r * csp[14].i + xcx[31].i * csp[14].r ;
   f2 = xcx[23].r ; f4 = xcx[23].i ;
   xcx[31].r = f2-f1 ; xcx[31].i = f4-f3 ;
   xcx[23].r = f2+f1 ; xcx[23].i = f4+f3 ;

   f1 = xcx[16].r ; f3 = xcx[16].i ;  /* cos=1 sin=0 */
   f2 = xcx[0].r ; f4 = xcx[0].i ;
   xcx[16].r = f2-f1 ; xcx[16].i = f4-f3 ;
   xcx[0].r = f2+f1 ; xcx[0].i = f4+f3 ;

   f1 = xcx[17].r * csp[16].r - xcx[17].i * csp[16].i ; /* twiddles */
   f3 = xcx[17].r * csp[16].i + xcx[17].i * csp[16].r ;
   f2 = xcx[1].r ; f4 = xcx[1].i ;
   xcx[17].r = f2-f1 ; xcx[17].i = f4-f3 ;
   xcx[1].r = f2+f1 ; xcx[1].i = f4+f3 ;

   f1 = xcx[18].r * csp[17].r - xcx[18].i * csp[17].i ; /* twiddles */
   f3 = xcx[18].r * csp[17].i + xcx[18].i * csp[17].r ;
   f2 = xcx[2].r ; f4 = xcx[2].i ;
   xcx[18].r = f2-f1 ; xcx[18].i = f4-f3 ;
   xcx[2].r = f2+f1 ; xcx[2].i = f4+f3 ;

   f1 = xcx[19].r * csp[18].r - xcx[19].i * csp[18].i ; /* twiddles */
   f3 = xcx[19].r * csp[18].i + xcx[19].i * csp[18].r ;
   f2 = xcx[3].r ; f4 = xcx[3].i ;
   xcx[19].r = f2-f1 ; xcx[19].i = f4-f3 ;
   xcx[3].r = f2+f1 ; xcx[3].i = f4+f3 ;

   f1 = xcx[20].r * csp[19].r - xcx[20].i * csp[19].i ; /* twiddles */
   f3 = xcx[20].r * csp[19].i + xcx[20].i * csp[19].r ;
   f2 = xcx[4].r ; f4 = xcx[4].i ;
   xcx[20].r = f2-f1 ; xcx[20].i = f4-f3 ;
   xcx[4].r = f2+f1 ; xcx[4].i = f4+f3 ;

   f1 = xcx[21].r * csp[20].r - xcx[21].i * csp[20].i ; /* twiddles */
   f3 = xcx[21].r * csp[20].i + xcx[21].i * csp[20].r ;
   f2 = xcx[5].r ; f4 = xcx[5].i ;
   xcx[21].r = f2-f1 ; xcx[21].i = f4-f3 ;
   xcx[5].r = f2+f1 ; xcx[5].i = f4+f3 ;

   f1 = xcx[22].r * csp[21].r - xcx[22].i * csp[21].i ; /* twiddles */
   f3 = xcx[22].r * csp[21].i + xcx[22].i * csp[21].r ;
   f2 = xcx[6].r ; f4 = xcx[6].i ;
   xcx[22].r = f2-f1 ; xcx[22].i = f4-f3 ;
   xcx[6].r = f2+f1 ; xcx[6].i = f4+f3 ;

   f1 = xcx[23].r * csp[22].r - xcx[23].i * csp[22].i ; /* twiddles */
   f3 = xcx[23].r * csp[22].i + xcx[23].i * csp[22].r ;
   f2 = xcx[7].r ; f4 = xcx[7].i ;
   xcx[23].r = f2-f1 ; xcx[23].i = f4-f3 ;
   xcx[7].r = f2+f1 ; xcx[7].i = f4+f3 ;

   f1 = - xcx[24].i * csp[23].i ; /* cos=0 twiddles */
   f3 = xcx[24].r * csp[23].i ;
   f2 = xcx[8].r ; f4 = xcx[8].i ;
   xcx[24].r = f2-f1 ; xcx[24].i = f4-f3 ;
   xcx[8].r = f2+f1 ; xcx[8].i = f4+f3 ;

   f1 = xcx[25].r * csp[24].r - xcx[25].i * csp[24].i ; /* twiddles */
   f3 = xcx[25].r * csp[24].i + xcx[25].i * csp[24].r ;
   f2 = xcx[9].r ; f4 = xcx[9].i ;
   xcx[25].r = f2-f1 ; xcx[25].i = f4-f3 ;
   xcx[9].r = f2+f1 ; xcx[9].i = f4+f3 ;

   f1 = xcx[26].r * csp[25].r - xcx[26].i * csp[25].i ; /* twiddles */
   f3 = xcx[26].r * csp[25].i + xcx[26].i * csp[25].r ;
   f2 = xcx[10].r ; f4 = xcx[10].i ;
   xcx[26].r = f2-f1 ; xcx[26].i = f4-f3 ;
   xcx[10].r = f2+f1 ; xcx[10].i = f4+f3 ;

   f1 = xcx[27].r * csp[26].r - xcx[27].i * csp[26].i ; /* twiddles */
   f3 = xcx[27].r * csp[26].i + xcx[27].i * csp[26].r ;
   f2 = xcx[11].r ; f4 = xcx[11].i ;
   xcx[27].r = f2-f1 ; xcx[27].i = f4-f3 ;
   xcx[11].r = f2+f1 ; xcx[11].i = f4+f3 ;

   f1 = xcx[28].r * csp[27].r - xcx[28].i * csp[27].i ; /* twiddles */
   f3 = xcx[28].r * csp[27].i + xcx[28].i * csp[27].r ;
   f2 = xcx[12].r ; f4 = xcx[12].i ;
   xcx[28].r = f2-f1 ; xcx[28].i = f4-f3 ;
   xcx[12].r = f2+f1 ; xcx[12].i = f4+f3 ;

   f1 = xcx[29].r * csp[28].r - xcx[29].i * csp[28].i ; /* twiddles */
   f3 = xcx[29].r * csp[28].i + xcx[29].i * csp[28].r ;
   f2 = xcx[13].r ; f4 = xcx[13].i ;
   xcx[29].r = f2-f1 ; xcx[29].i = f4-f3 ;
   xcx[13].r = f2+f1 ; xcx[13].i = f4+f3 ;

   f1 = xcx[30].r * csp[29].r - xcx[30].i * csp[29].i ; /* twiddles */
   f3 = xcx[30].r * csp[29].i + xcx[30].i * csp[29].r ;
   f2 = xcx[14].r ; f4 = xcx[14].i ;
   xcx[30].r = f2-f1 ; xcx[30].i = f4-f3 ;
   xcx[14].r = f2+f1 ; xcx[14].i = f4+f3 ;

   f1 = xcx[31].r * csp[30].r - xcx[31].i * csp[30].i ; /* twiddles */
   f3 = xcx[31].r * csp[30].i + xcx[31].i * csp[30].r ;
   f2 = xcx[15].r ; f4 = xcx[15].i ;
   xcx[31].r = f2-f1 ; xcx[31].i = f4-f3 ;
   xcx[15].r = f2+f1 ; xcx[15].i = f4+f3 ;

   return ;
}

/*----------------------------------------------------------------
   Do a 64 FFT using fft32 and decimation-by-2
------------------------------------------------------------------*/

static void fft64( int mode , complex * xc )
{
   static complex * cs=NULL , * aa , * bb ;
   register int k , i ;
   register float akr,aki , bkr,bki , tr,ti , t1,t2 ;

   /*-- initialize cosine/sine table --*/

   if( cs == NULL ){
      double th = (PI/32.0) ;
      cs = (complex *) malloc(sizeof(complex)*32) ;
      aa = (complex *) malloc(sizeof(complex)*32) ;
      bb = (complex *) malloc(sizeof(complex)*32) ;

      cs[0].r = 1.0 ; cs[0].i = 0.0 ; /* never used */
      for( k=1 ; k < 32 ; k++ ){ cs[k].r = cos(k*th); cs[k].i = sin(k*th); }
   }

   /*-- load subarrays, and FFT each one --*/

   for( i=k=0 ; k < 32 ; k++ ){ aa[k] = xc[i++] ; bb[k] = xc[i++] ; }

   fft32( mode , aa ) ; fft32( mode , bb ) ;

   /*-- recombine: 0 and Nyquist --*/

   xc[ 0].r = aa[0].r + bb[0].r ; xc[ 0].i = aa[0].i + bb[0].i ;
   xc[32].r = aa[0].r - bb[0].r ; xc[32].i = aa[0].i - bb[0].i ;

   /*-- recombine: all others --*/

   if( mode > 0 ){
      for( k=1 ; k < 32 ; k++ ){
         bkr = bb[k].r; bki = bb[k].i;
         tr  = cs[k].r; ti  = cs[k].i;

         t1  = tr*bkr - ti*bki ; t2 = tr*bki + ti*bkr ;

         akr = aa[k].r; aki = aa[k].i;

         xc[k   ].r = akr+t1; xc[k   ].i = aki+t2;
         xc[k+32].r = akr-t1; xc[k+32].i = aki-t2;
      }
   } else {
      for( k=1 ; k < 32 ; k++ ){
         bkr = bb[k].r; bki = bb[k].i;
         tr  = cs[k].r; ti  = cs[k].i;
                                                         /* only change from */
         t1  = tr*bkr + ti*bki ; t2 = tr*bki - ti*bkr ;  /* above is with ti */

         akr = aa[k].r; aki = aa[k].i;

         xc[k   ].r = akr+t1; xc[k   ].i = aki+t2;
         xc[k+32].r = akr-t1; xc[k+32].i = aki-t2;
      }
   }

   return ;
}

/*================================================================
  Decimation by 4 routines below from 12 Aug 1999 -- RWCox
  On a 400 MHz PII, about 8% faster than 2 decimation by 2 steps.
  Added fft512, fft1024, and fft2048 also, which are 50-80%
  faster than the unrolled csfft_cox() routine on the PII --
  somewhat less speedup on SGI R10K, but still worth something.
==================================================================*/

/*----------------------------------------------------------------
   Do a 128 FFT using fft32 and decimation by 4
------------------------------------------------------------------*/

#define N  128
#define M   32
#define M2  64
#define M3  96
static void fft128( int mode , complex * xc )
{
   static complex * cs=NULL , * aa , * bb , * cc , * dd ;
   register int k , i ;
   register float aar,aai, tr,ti, bbr,bbi, ccr,cci , ddr,ddi , t1,t2 ,
                  acpr,acmr , bdpr,bdmr , acpi,acmi , bdpi,bdmi ;

   /*-- initialize cosine/sine table and memory space --*/

   if( cs == NULL ){
      double th = (2.0*PI/N) ;
      cs = (complex *) malloc(sizeof(complex)*M3) ;
      aa = (complex *) malloc(sizeof(complex)*M ) ;
      bb = (complex *) malloc(sizeof(complex)*M ) ;
      cc = (complex *) malloc(sizeof(complex)*M ) ;
      dd = (complex *) malloc(sizeof(complex)*M ) ;

      cs[0].r = 1.0 ; cs[0].i = 0.0 ;
      for( k=1 ; k < M3 ; k++ ){ cs[k].r = cos(k*th); cs[k].i = sin(k*th); }
   }

   /*-- load subarrays, and FFT each one --*/

   for( i=k=0; k < M; k++ ){
      aa[k]=xc[i++]; bb[k]=xc[i++]; cc[k]=xc[i++]; dd[k]=xc[i++];
   }

   fft32(mode,aa); fft32(mode,bb); fft32(mode,cc); fft32(mode,dd);

   /*-- recombination --*/

   if( mode > 0 ){
      for( k=0 ; k < M ; k++ ){
         t1  = bb[k].r; t2  = bb[k].i;
         tr  = cs[k].r; ti  = cs[k].i;
         bbr = tr*t1 - ti*t2  ; bbi = tr*t2 + ti*t1  ;  /* b[k]*exp(+2*Pi*k/N) */

         t1  = cc[  k].r; t2  = cc[  k].i;
         tr  = cs[2*k].r; ti  = cs[2*k].i;
         ccr = tr*t1 - ti*t2  ; cci = tr*t2 + ti*t1  ;  /* c[k]*exp(+4*Pi*k/N) */

         t1  = dd[  k].r; t2  = dd[  k].i;
         tr  = cs[3*k].r; ti  = cs[3*k].i;
         ddr = tr*t1 - ti*t2  ; ddi = tr*t2 + ti*t1  ;  /* d[k]*exp(+6*Pi*k/N) */

         aar = aa[k].r; aai = aa[k].i;                  /* a[k] */

         acpr = aar + ccr ; acmr = aar - ccr ;
         bdpr = bbr + ddr ; bdmr = bbr - ddr ;
         acpi = aai + cci ; acmi = aai - cci ;
         bdpi = bbi + ddi ; bdmi = bbi - ddi ;

         xc[k   ].r = acpr+bdpr ; xc[k   ].i = acpi+bdpi ;
         xc[k+M ].r = acmr-bdmi ; xc[k+M ].i = acmi+bdmr ;
         xc[k+M2].r = acpr-bdpr ; xc[k+M2].i = acpi-bdpi ;
         xc[k+M3].r = acmr+bdmi ; xc[k+M3].i = acmi-bdmr ;
      }
   } else {
      for( k=0 ; k < M ; k++ ){
         t1  = bb[k].r; t2  = bb[k].i;
         tr  = cs[k].r; ti  = cs[k].i;
         bbr = tr*t1 + ti*t2  ; bbi = tr*t2 - ti*t1  ;  /* b[k]*exp(-2*Pi*k/N) */

         t1  = cc[  k].r; t2  = cc[  k].i;
         tr  = cs[2*k].r; ti  = cs[2*k].i;
         ccr = tr*t1 + ti*t2  ; cci = tr*t2 - ti*t1  ;  /* c[k]*exp(-4*Pi*k/N) */

         t1  = dd[  k].r; t2  = dd[  k].i;
         tr  = cs[3*k].r; ti  = cs[3*k].i;
         ddr = tr*t1 + ti*t2  ; ddi = tr*t2 - ti*t1  ;  /* d[k]*exp(-6*Pi*k/N) */

         aar = aa[k].r; aai = aa[k].i;                  /* a[k] */

         acpr = aar + ccr ; acmr = aar - ccr ;
         bdpr = bbr + ddr ; bdmr = bbr - ddr ;
         acpi = aai + cci ; acmi = aai - cci ;
         bdpi = bbi + ddi ; bdmi = bbi - ddi ;

         xc[k   ].r = acpr+bdpr ; xc[k   ].i = acpi+bdpi ;
         xc[k+M ].r = acmr+bdmi ; xc[k+M ].i = acmi-bdmr ;
         xc[k+M2].r = acpr-bdpr ; xc[k+M2].i = acpi-bdpi ;
         xc[k+M3].r = acmr-bdmi ; xc[k+M3].i = acmi+bdmr ;
      }
   }

   return ;
}
#undef N
#undef M
#undef M2
#undef M3

/*------------------------------------------------------------------
   Do a 256 FFT using fft64 and decimation by 4
--------------------------------------------------------------------*/

#define N  256
#define M   64
#define M2 128
#define M3 192
static void fft256( int mode , complex * xc )
{
   static complex * cs=NULL , * aa , * bb , * cc , * dd ;
   register int k , i ;
   register float aar,aai, tr,ti, bbr,bbi, ccr,cci , ddr,ddi , t1,t2 ,
                  acpr,acmr , bdpr,bdmr , acpi,acmi , bdpi,bdmi ;

   /*-- initialize cosine/sine table and memory space --*/

   if( cs == NULL ){
      double th = (2.0*PI/N) ;
      cs = (complex *) malloc(sizeof(complex)*M3) ;
      aa = (complex *) malloc(sizeof(complex)*M ) ;
      bb = (complex *) malloc(sizeof(complex)*M ) ;
      cc = (complex *) malloc(sizeof(complex)*M ) ;
      dd = (complex *) malloc(sizeof(complex)*M ) ;

      cs[0].r = 1.0 ; cs[0].i = 0.0 ;
      for( k=1 ; k < M3 ; k++ ){ cs[k].r = cos(k*th); cs[k].i = sin(k*th); }
   }

   /*-- load subarrays, and FFT each one --*/

   for( i=k=0; k < M; k++ ){
      aa[k]=xc[i++]; bb[k]=xc[i++]; cc[k]=xc[i++]; dd[k]=xc[i++];
   }

   fft64(mode,aa); fft64(mode,bb); fft64(mode,cc); fft64(mode,dd);

   /*-- recombination --*/

   if( mode > 0 ){
      for( k=0 ; k < M ; k++ ){
         t1  = bb[k].r; t2  = bb[k].i;
         tr  = cs[k].r; ti  = cs[k].i;
         bbr = tr*t1 - ti*t2  ; bbi = tr*t2 + ti*t1  ;  /* b[k]*exp(+2*Pi*k/N) */

         t1  = cc[  k].r; t2  = cc[  k].i;
         tr  = cs[2*k].r; ti  = cs[2*k].i;
         ccr = tr*t1 - ti*t2  ; cci = tr*t2 + ti*t1  ;  /* c[k]*exp(+4*Pi*k/N) */

         t1  = dd[  k].r; t2  = dd[  k].i;
         tr  = cs[3*k].r; ti  = cs[3*k].i;
         ddr = tr*t1 - ti*t2  ; ddi = tr*t2 + ti*t1  ;  /* d[k]*exp(+6*Pi*k/N) */

         aar = aa[k].r; aai = aa[k].i;                  /* a[k] */

         acpr = aar + ccr ; acmr = aar - ccr ;
         bdpr = bbr + ddr ; bdmr = bbr - ddr ;
         acpi = aai + cci ; acmi = aai - cci ;
         bdpi = bbi + ddi ; bdmi = bbi - ddi ;

         xc[k   ].r = acpr+bdpr ; xc[k   ].i = acpi+bdpi ;
         xc[k+M ].r = acmr-bdmi ; xc[k+M ].i = acmi+bdmr ;
         xc[k+M2].r = acpr-bdpr ; xc[k+M2].i = acpi-bdpi ;
         xc[k+M3].r = acmr+bdmi ; xc[k+M3].i = acmi-bdmr ;
      }
   } else {
      for( k=0 ; k < M ; k++ ){
         t1  = bb[k].r; t2  = bb[k].i;
         tr  = cs[k].r; ti  = cs[k].i;
         bbr = tr*t1 + ti*t2  ; bbi = tr*t2 - ti*t1  ;  /* b[k]*exp(-2*Pi*k/N) */

         t1  = cc[  k].r; t2  = cc[  k].i;
         tr  = cs[2*k].r; ti  = cs[2*k].i;
         ccr = tr*t1 + ti*t2  ; cci = tr*t2 - ti*t1  ;  /* c[k]*exp(-4*Pi*k/N) */

         t1  = dd[  k].r; t2  = dd[  k].i;
         tr  = cs[3*k].r; ti  = cs[3*k].i;
         ddr = tr*t1 + ti*t2  ; ddi = tr*t2 - ti*t1  ;  /* d[k]*exp(-6*Pi*k/N) */

         aar = aa[k].r; aai = aa[k].i;                  /* a[k] */

         acpr = aar + ccr ; acmr = aar - ccr ;
         bdpr = bbr + ddr ; bdmr = bbr - ddr ;
         acpi = aai + cci ; acmi = aai - cci ;
         bdpi = bbi + ddi ; bdmi = bbi - ddi ;

         xc[k   ].r = acpr+bdpr ; xc[k   ].i = acpi+bdpi ;
         xc[k+M ].r = acmr+bdmi ; xc[k+M ].i = acmi-bdmr ;
         xc[k+M2].r = acpr-bdpr ; xc[k+M2].i = acpi-bdpi ;
         xc[k+M3].r = acmr-bdmi ; xc[k+M3].i = acmi+bdmr ;
      }
   }

   return ;
}
#undef N
#undef M
#undef M2
#undef M3

/*------------------------------------------------------------------
   Do a 512 FFT using fft128 and decimation by 4
--------------------------------------------------------------------*/

#define N  512
#define M  128
#define M2 256
#define M3 384
static void fft512( int mode , complex * xc )
{
   static complex * cs=NULL , * aa , * bb , * cc , * dd ;
   register int k , i ;
   register float aar,aai, tr,ti, bbr,bbi, ccr,cci , ddr,ddi , t1,t2 ,
                  acpr,acmr , bdpr,bdmr , acpi,acmi , bdpi,bdmi ;

   /*-- initialize cosine/sine table and memory space --*/

   if( cs == NULL ){
      double th = (2.0*PI/N) ;
      cs = (complex *) malloc(sizeof(complex)*M3) ;
      aa = (complex *) malloc(sizeof(complex)*M ) ;
      bb = (complex *) malloc(sizeof(complex)*M ) ;
      cc = (complex *) malloc(sizeof(complex)*M ) ;
      dd = (complex *) malloc(sizeof(complex)*M ) ;

      cs[0].r = 1.0 ; cs[0].i = 0.0 ;
      for( k=1 ; k < M3 ; k++ ){ cs[k].r = cos(k*th); cs[k].i = sin(k*th); }
   }

   /*-- load subarrays, and FFT each one --*/

   for( i=k=0; k < M; k++ ){
      aa[k]=xc[i++]; bb[k]=xc[i++]; cc[k]=xc[i++]; dd[k]=xc[i++];
   }

   fft128(mode,aa); fft128(mode,bb); fft128(mode,cc); fft128(mode,dd);

   /*-- recombination --*/

   if( mode > 0 ){
      for( k=0 ; k < M ; k++ ){
         t1  = bb[k].r; t2  = bb[k].i;
         tr  = cs[k].r; ti  = cs[k].i;
         bbr = tr*t1 - ti*t2  ; bbi = tr*t2 + ti*t1  ;  /* b[k]*exp(+2*Pi*k/N) */

         t1  = cc[  k].r; t2  = cc[  k].i;
         tr  = cs[2*k].r; ti  = cs[2*k].i;
         ccr = tr*t1 - ti*t2  ; cci = tr*t2 + ti*t1  ;  /* c[k]*exp(+4*Pi*k/N) */

         t1  = dd[  k].r; t2  = dd[  k].i;
         tr  = cs[3*k].r; ti  = cs[3*k].i;
         ddr = tr*t1 - ti*t2  ; ddi = tr*t2 + ti*t1  ;  /* d[k]*exp(+6*Pi*k/N) */

         aar = aa[k].r; aai = aa[k].i;                  /* a[k] */

         acpr = aar + ccr ; acmr = aar - ccr ;
         bdpr = bbr + ddr ; bdmr = bbr - ddr ;
         acpi = aai + cci ; acmi = aai - cci ;
         bdpi = bbi + ddi ; bdmi = bbi - ddi ;

         xc[k   ].r = acpr+bdpr ; xc[k   ].i = acpi+bdpi ;
         xc[k+M ].r = acmr-bdmi ; xc[k+M ].i = acmi+bdmr ;
         xc[k+M2].r = acpr-bdpr ; xc[k+M2].i = acpi-bdpi ;
         xc[k+M3].r = acmr+bdmi ; xc[k+M3].i = acmi-bdmr ;
      }
   } else {
      for( k=0 ; k < M ; k++ ){
         t1  = bb[k].r; t2  = bb[k].i;
         tr  = cs[k].r; ti  = cs[k].i;
         bbr = tr*t1 + ti*t2  ; bbi = tr*t2 - ti*t1  ;  /* b[k]*exp(-2*Pi*k/N) */

         t1  = cc[  k].r; t2  = cc[  k].i;
         tr  = cs[2*k].r; ti  = cs[2*k].i;
         ccr = tr*t1 + ti*t2  ; cci = tr*t2 - ti*t1  ;  /* c[k]*exp(-4*Pi*k/N) */

         t1  = dd[  k].r; t2  = dd[  k].i;
         tr  = cs[3*k].r; ti  = cs[3*k].i;
         ddr = tr*t1 + ti*t2  ; ddi = tr*t2 - ti*t1  ;  /* d[k]*exp(-6*Pi*k/N) */

         aar = aa[k].r; aai = aa[k].i;                  /* a[k] */

         acpr = aar + ccr ; acmr = aar - ccr ;
         bdpr = bbr + ddr ; bdmr = bbr - ddr ;
         acpi = aai + cci ; acmi = aai - cci ;
         bdpi = bbi + ddi ; bdmi = bbi - ddi ;

         xc[k   ].r = acpr+bdpr ; xc[k   ].i = acpi+bdpi ;
         xc[k+M ].r = acmr+bdmi ; xc[k+M ].i = acmi-bdmr ;
         xc[k+M2].r = acpr-bdpr ; xc[k+M2].i = acpi-bdpi ;
         xc[k+M3].r = acmr-bdmi ; xc[k+M3].i = acmi+bdmr ;
      }
   }

   return ;
}
#undef N
#undef M
#undef M2
#undef M3

/*------------------------------------------------------------------
   fft_4dec: do a decimation by 4 for N=1024 and 2048.
   At most RMAX levels of recursion are allowed.
--------------------------------------------------------------------*/

static void fft_4dec( int mode , int idim , complex * xc )
{
   static int rec=0 ;
   static int rmold[RMAX] = {-1,-1,-1} ;
   static complex *rcs[RMAX] = {NULL,NULL,NULL} ;
   static complex *raa[RMAX], *rbb[RMAX], *rcc[RMAX] , *rdd[RMAX] ;

   int N=idim , M=idim/4 , M2=2*M , M3=3*M ;
   int mold=-1 ;
   complex * cs , * aa , * bb , * cc , * dd ;
   register int k , i ;
   register float aar,aai, tr,ti, bbr,bbi, ccr,cci , ddr,ddi , t1,t2 ,
                  acpr,acmr , bdpr,bdmr , acpi,acmi , bdpi,bdmi ;

   /*-- initialize cosine/sine table and memory space --*/

   if( rec >= RMAX ){
      fprintf(stderr,"\n*** fft_4dec: too many recursions!\n"); EXIT(1);
   }

   mold = rmold[rec] ;  /* what was done before at this recursion level */

   if( M != mold ){
      double th = (2.0*PI/N) ;
      if( M > mold ){
         if( rcs[rec] != NULL ){
            free(rcs[rec]);free(raa[rec]);
            free(rbb[rec]);free(rcc[rec]);free(rdd[rec]);
         }
         rcs[rec] = (complex *) malloc(sizeof(complex)*M3) ;
         raa[rec] = (complex *) malloc(sizeof(complex)*M ) ;
         rbb[rec] = (complex *) malloc(sizeof(complex)*M ) ;
         rcc[rec] = (complex *) malloc(sizeof(complex)*M ) ;
         rdd[rec] = (complex *) malloc(sizeof(complex)*M ) ;
      }

      rcs[rec][0].r = 1.0 ; rcs[rec][0].i = 0.0 ;
      for( k=1 ; k < M3 ; k++ ){
         rcs[rec][k].r = cos(k*th); rcs[rec][k].i = sin(k*th);
      }
      rmold[rec] = M ;
   }

   cs = rcs[rec]; aa = raa[rec]; bb = rbb[rec]; cc = rcc[rec]; dd = rdd[rec];

   /*-- load subarrays, and FFT each one --*/

   for( i=k=0; k < M; k++ ){
      aa[k]=xc[i++]; bb[k]=xc[i++]; cc[k]=xc[i++]; dd[k]=xc[i++];
   }

   rec++ ;
   switch( N ){

      default: fprintf(stderr,"\n*** Illegal call to fft_4dec: N=%d\n",N);
               EXIT(1) ;

      case 1024: fft256(mode,aa); fft256(mode,bb);
                 fft256(mode,cc); fft256(mode,dd); break;

      case 2048: fft512(mode,aa); fft512(mode,bb);
                 fft512(mode,cc); fft512(mode,dd); break;

      case 4096: fft_4dec(mode,1024,aa); fft_4dec(mode,1024,bb);         /* recurse once */
                 fft_4dec(mode,1024,cc); fft_4dec(mode,1024,dd); break ;

      case 8192: fft_4dec(mode,2048,aa); fft_4dec(mode,2048,bb);         /* recurse once */
                 fft_4dec(mode,2048,cc); fft_4dec(mode,2048,dd); break ;

      case 16384: fft_4dec(mode,4096,aa); fft_4dec(mode,4096,bb);        /* recurse twice */
                  fft_4dec(mode,4096,cc); fft_4dec(mode,4096,dd); break ;

      case 32768: fft_4dec(mode,8192,aa); fft_4dec(mode,8192,bb);        /* recurse twice */
                  fft_4dec(mode,8192,cc); fft_4dec(mode,8192,dd); break ;
   }
   rec-- ;

   /*-- recombination --*/

   if( mode > 0 ){
      for( k=0 ; k < M ; k++ ){
         t1  = bb[k].r; t2  = bb[k].i;
         tr  = cs[k].r; ti  = cs[k].i;
         bbr = tr*t1 - ti*t2  ; bbi = tr*t2 + ti*t1  ;  /* b[k]*exp(+2*Pi*k/N) */

         t1  = cc[  k].r; t2  = cc[  k].i;
         tr  = cs[2*k].r; ti  = cs[2*k].i;
         ccr = tr*t1 - ti*t2  ; cci = tr*t2 + ti*t1  ;  /* c[k]*exp(+4*Pi*k/N) */

         t1  = dd[  k].r; t2  = dd[  k].i;
         tr  = cs[3*k].r; ti  = cs[3*k].i;
         ddr = tr*t1 - ti*t2  ; ddi = tr*t2 + ti*t1  ;  /* d[k]*exp(+6*Pi*k/N) */

         aar = aa[k].r; aai = aa[k].i;                  /* a[k] */

         acpr = aar + ccr ; acmr = aar - ccr ;
         bdpr = bbr + ddr ; bdmr = bbr - ddr ;
         acpi = aai + cci ; acmi = aai - cci ;
         bdpi = bbi + ddi ; bdmi = bbi - ddi ;

         xc[k   ].r = acpr+bdpr ; xc[k   ].i = acpi+bdpi ;
         xc[k+M ].r = acmr-bdmi ; xc[k+M ].i = acmi+bdmr ;
         xc[k+M2].r = acpr-bdpr ; xc[k+M2].i = acpi-bdpi ;
         xc[k+M3].r = acmr+bdmi ; xc[k+M3].i = acmi-bdmr ;
      }
   } else {
      for( k=0 ; k < M ; k++ ){
         t1  = bb[k].r; t2  = bb[k].i;
         tr  = cs[k].r; ti  = cs[k].i;
         bbr = tr*t1 + ti*t2  ; bbi = tr*t2 - ti*t1  ;  /* b[k]*exp(-2*Pi*k/N) */

         t1  = cc[  k].r; t2  = cc[  k].i;
         tr  = cs[2*k].r; ti  = cs[2*k].i;
         ccr = tr*t1 + ti*t2  ; cci = tr*t2 - ti*t1  ;  /* c[k]*exp(-4*Pi*k/N) */

         t1  = dd[  k].r; t2  = dd[  k].i;
         tr  = cs[3*k].r; ti  = cs[3*k].i;
         ddr = tr*t1 + ti*t2  ; ddi = tr*t2 - ti*t1  ;  /* d[k]*exp(-6*Pi*k/N) */

         aar = aa[k].r; aai = aa[k].i;                  /* a[k] */

         acpr = aar + ccr ; acmr = aar - ccr ;
         bdpr = bbr + ddr ; bdmr = bbr - ddr ;
         acpi = aai + cci ; acmi = aai - cci ;
         bdpi = bbi + ddi ; bdmi = bbi - ddi ;

         xc[k   ].r = acpr+bdpr ; xc[k   ].i = acpi+bdpi ;
         xc[k+M ].r = acmr+bdmi ; xc[k+M ].i = acmi-bdmr ;
         xc[k+M2].r = acpr-bdpr ; xc[k+M2].i = acpi-bdpi ;
         xc[k+M3].r = acmr-bdmi ; xc[k+M3].i = acmi+bdmr ;
      }
   }

   return ;
}
#endif /* DONT_UNROLL_FFTS */

/*=======================================================================
  The following radix-3 and radix-5 routines are by RWCox -- Aug 1999.
  They are not inside unrolled code.
=========================================================================*/

/*------------------------------------------------------------------
   fft_3dec: do a decimation-by-3, plus a power-of-2.
   At most RMAX levels of recursion are allowed, which means
   that at most 3**RMAX can be a factor of idim.
--------------------------------------------------------------------*/

#undef  CC3
#undef  SS3
#define CC3 (-0.5)         /* cos(2*Pi/3) */
#define SS3 (0.8660254038) /* sin(2*Pi/3) */

static void fft_3dec( int mode , int idim , complex * xc )
{
   static int rec=0 ;
   static int rmold[RMAX] = {-1,-1,-1} ;
   static complex *rcs[RMAX] = {NULL,NULL,NULL} ;
   static complex *raa[RMAX], *rbb[RMAX], *rcc[RMAX] ;

   int N=idim , M=idim/3 , M2=2*M ;
   int mold ;
   complex * cs=NULL , * aa , * bb , * cc ;
   register int k , i ;
   register float aar,aai, tr,ti, bbr,bbi, ccr,cci,
                  t1,t2,t4,t5,t6,t8 ;

   /*-- initialize cosine/sine table and memory space --*/

   if( rec >= RMAX ){
      fprintf(stderr,"\n*** fft_3dec: too many recursions!\n"); EXIT(1);
   }

   mold = rmold[rec] ;  /* what was done before at this recursion level */

   if( M != mold ){
      double th = (2.0*PI/N) ;
      if( M > mold ){
         if( rcs[rec] != NULL ){
            free(rcs[rec]);free(raa[rec]);free(rbb[rec]);free(rcc[rec]);
         }
         rcs[rec] = (complex *) malloc(sizeof(complex)*M2) ;
         raa[rec] = (complex *) malloc(sizeof(complex)*M ) ;
         rbb[rec] = (complex *) malloc(sizeof(complex)*M ) ;
         rcc[rec] = (complex *) malloc(sizeof(complex)*M ) ;
      }

      rcs[rec][0].r = 1.0 ; rcs[rec][0].i = 0.0 ;
      for( k=1 ; k < M2 ; k++ ){
         rcs[rec][k].r = cos(k*th); rcs[rec][k].i = sin(k*th);
      }
      rmold[rec] = M ;
   }

   cs = rcs[rec] ; aa = raa[rec] ; bb = rbb[rec] ; cc = rcc[rec] ;

   /*-- load subarrays, and FFT each one --*/

   for( i=k=0; k < M; k++ ){ aa[k]=xc[i++]; bb[k]=xc[i++]; cc[k]=xc[i++]; }

#ifndef DONT_UNROLL_FFTS
   switch( M ){
      case   1: break ;

      case   2: fft2  (mode,aa) ; fft2  (mode,bb) ; fft2  (mode,cc) ; break;
      case   4: fft4  (mode,aa) ; fft4  (mode,bb) ; fft4  (mode,cc) ; break;
      case   8: fft8  (mode,aa) ; fft8  (mode,bb) ; fft8  (mode,cc) ; break;
      case  16: fft16 (mode,aa) ; fft16 (mode,bb) ; fft16 (mode,cc) ; break;
      case  32: fft32 (mode,aa) ; fft32 (mode,bb) ; fft32 (mode,cc) ; break;
      case  64: fft64 (mode,aa) ; fft64 (mode,bb) ; fft64 (mode,cc) ; break;
      case 128: fft128(mode,aa) ; fft128(mode,bb) ; fft128(mode,cc) ; break;
      case 256: fft256(mode,aa) ; fft256(mode,bb) ; fft256(mode,cc) ; break;
      case 512: fft512(mode,aa) ; fft512(mode,bb) ; fft512(mode,cc) ; break;

      case 1024: fft1024(mode,aa) ; fft1024(mode,bb) ; fft1024(mode,cc) ; break;
      case 2048: fft2048(mode,aa) ; fft2048(mode,bb) ; fft2048(mode,cc) ; break;

      default:  rec++ ;
                csfft_cox(mode,M,aa) ;
                csfft_cox(mode,M,bb) ; csfft_cox(mode,M,cc) ;
                rec-- ; break ;
   }
#else
      rec++ ;
      csfft_cox(mode,M,aa) ; csfft_cox(mode,M,bb) ; csfft_cox(mode,M,cc) ;
      rec-- ;
#endif

   /*-- recombination --*/

   if( mode > 0 ){
      for( k=0 ; k < M ; k++ ){
         t1  = bb[k].r; t2  = bb[k].i;
         tr  = cs[k].r; ti  = cs[k].i;
         bbr = tr*t1  - ti*t2  ; bbi = tr*t2  + ti*t1 ; /* b[k]*exp(+2*Pi*k/N) */

         t1  = cc[  k].r; t2  = cc[  k].i;
         tr  = cs[2*k].r; ti  = cs[2*k].i;
         ccr = tr*t1  - ti*t2  ; cci = tr*t2  + ti*t1 ; /* c[k]*exp(+4*Pi*k/N) */

         t4 = bbr+ccr      ; t1 = t4*CC3       ;
         t8 = bbi+cci      ; t6 = t8*CC3       ;
         t5 = (bbr-ccr)*SS3; t2 = (bbi-cci)*SS3;

         aar = aa[k].r; aai = aa[k].i;                  /* a[k] */

         xc[k   ].r = aar+t4      ; xc[k   ].i = aai+t8      ;
         xc[k+M ].r = (aar+t1)-t2 ; xc[k+M ].i = (aai+t6)+t5 ;
         xc[k+M2].r = (aar+t1)+t2 ; xc[k+M2].i = (aai+t6)-t5 ;
      }
   } else {
      for( k=0 ; k < M ; k++ ){
         t1  = bb[k].r; t2  = bb[k].i;
         tr  = cs[k].r; ti  = cs[k].i;
         bbr = tr*t1  + ti*t2  ; bbi = tr*t2  - ti*t1 ; /* b[k]*exp(-2*Pi*k/N) */

         t1  = cc[  k].r; t2  = cc[  k].i;
         tr  = cs[2*k].r; ti  = cs[2*k].i;
         ccr = tr*t1  + ti*t2  ; cci = tr*t2  - ti*t1 ; /* c[k]*exp(-4*Pi*k/N) */

         t4 = bbr+ccr      ; t1 = t4*CC3       ;
         t8 = bbi+cci      ; t6 = t8*CC3       ;
         t5 = (bbr-ccr)*SS3; t2 = (bbi-cci)*SS3;

         aar = aa[k].r; aai = aa[k].i;                  /* a[k] */

         xc[k   ].r = aar+t4      ; xc[k   ].i = aai+t8      ;
         xc[k+M ].r = (aar+t1)+t2 ; xc[k+M ].i = (aai+t6)-t5 ;
         xc[k+M2].r = (aar+t1)-t2 ; xc[k+M2].i = (aai+t6)+t5 ;
      }
   }

   return ;
}

/*------------------------------------------------------------------
   fft_5dec: do a decimation-by-5, plus a power-of-2.
   At most RMAX levels of recursion are allowed, which means
   that at most 5**RMAX can be a factor of idim.
--------------------------------------------------------------------*/

#undef  COS72
#undef  SIN72
#define COS72  0.30901699   /* cos(72 deg) */
#define SIN72  0.95105652   /* sin(72 deg) */

static void fft_5dec( int mode , int idim , complex * xc )
{
   static int rec=0 ;
   static int rmold[RMAX] = {-1,-1,-1} ;
   static complex *rcs[RMAX] = {NULL,NULL,NULL} ;
   static complex *raa[RMAX], *rbb[RMAX], *rcc[RMAX], *rdd[RMAX], *ree[RMAX] ;

   int N=idim , M=idim/5 , M2=2*M , M3=3*M , M4=4*M ;
   int mold ;
   complex * cs, *aa, *bb, *cc, *dd, *ee ;
   register int k , i ;
   register float aar,aai,bbr,bbi,ccr,cci,ddr,ddi,eer,eei ;
   register float akp,akm,bkp,bkm , ajp,ajm,bjp,bjm ;
   register float ak,bk,aj,bj ;
   float c72 , s72 , c2,s2 ;
   int ss ;

   /*-- initialize cosine/sine table and memory space --*/

   if( rec >= RMAX ){
      fprintf(stderr,"\n*** fft_5dec: too many recursions!\n"); EXIT(1);
   }

   mold = rmold[rec] ;  /* what was done before at this recursion level */

   if( M != mold ){     /* new value of M? */
      double th = (2.0*PI/N) ;
      if( M > mold ){             /* need more space */
         if( rcs[rec] != NULL ){
            free(rcs[rec]);free(raa[rec]);free(rbb[rec]);
            free(rcc[rec]);free(rdd[rec]);free(ree[rec]);
         }
         rcs[rec] = (complex *) malloc(sizeof(complex)*M4) ;
         raa[rec] = (complex *) malloc(sizeof(complex)*M ) ;
         rbb[rec] = (complex *) malloc(sizeof(complex)*M ) ;
         rcc[rec] = (complex *) malloc(sizeof(complex)*M ) ;
         rdd[rec] = (complex *) malloc(sizeof(complex)*M ) ;
         ree[rec] = (complex *) malloc(sizeof(complex)*M ) ;
      }

      rcs[rec][0].r = 1.0 ; rcs[rec][0].i = 0.0 ;
      for( k=1 ; k < M4 ; k++ ){
         rcs[rec][k].r = cos(k*th); rcs[rec][k].i = sin(k*th);
      }
      rmold[rec] = M ;  /* you must remember this */
   }

   cs = rcs[rec] ; aa = raa[rec] ; bb = rbb[rec] ;
                   cc = rcc[rec] ; dd = rdd[rec] ; ee = ree[rec] ;

   /*-- load subarrays, and FFT each one --*/

   for( i=k=0; k < M; k++ ){
      aa[k]=xc[i++]; bb[k]=xc[i++]; cc[k]=xc[i++]; dd[k]=xc[i++]; ee[k]=xc[i++];
   }

#ifndef DONT_UNROLL_FFTS
   switch( M ){
      case   1: break ;

      case   2: fft2  (mode,aa) ; fft2  (mode,bb) ; fft2  (mode,cc) ;
                fft2  (mode,dd) ; fft2  (mode,ee) ;                   break;

      case   4: fft4  (mode,aa) ; fft4  (mode,bb) ; fft4  (mode,cc) ;
                fft4  (mode,dd) ; fft4  (mode,ee) ;                   break;

      case   8: fft8  (mode,aa) ; fft8  (mode,bb) ; fft8  (mode,cc) ;
                fft8  (mode,dd) ; fft8  (mode,ee) ;                   break;

      case  16: fft16 (mode,aa) ; fft16 (mode,bb) ; fft16 (mode,cc) ;
                fft16 (mode,dd) ; fft16 (mode,ee) ;                   break;

      case  32: fft32 (mode,aa) ; fft32 (mode,bb) ; fft32 (mode,cc) ;
                fft32 (mode,dd) ; fft32 (mode,ee) ;                   break;

      case  64: fft64 (mode,aa) ; fft64 (mode,bb) ; fft64 (mode,cc) ;
                fft64 (mode,dd) ; fft64 (mode,ee) ;                   break;

      case 128: fft128(mode,aa) ; fft128(mode,bb) ; fft128(mode,cc) ;
                fft128(mode,dd) ; fft128(mode,ee) ;                   break;

      case 256: fft256(mode,aa) ; fft256(mode,bb) ; fft256(mode,cc) ;
                fft256(mode,dd) ; fft256(mode,ee) ;                   break;

      case 512: fft512(mode,aa) ; fft512(mode,bb) ; fft512(mode,cc) ;
                fft512(mode,dd) ; fft512(mode,ee) ;                   break;

      case 1024: fft1024(mode,aa) ; fft1024(mode,bb) ; fft1024(mode,cc) ;
                 fft1024(mode,dd) ; fft1024(mode,ee) ;                   break;

      case 2048: fft2048(mode,aa) ; fft2048(mode,bb) ; fft2048(mode,cc) ;
                 fft2048(mode,dd) ; fft2048(mode,ee) ;                   break;

      default:  rec++ ;
                csfft_cox(mode,M,aa) ;
                csfft_cox(mode,M,bb) ; csfft_cox(mode,M,cc) ;
                csfft_cox(mode,M,dd) ; csfft_cox(mode,M,ee) ;
                rec-- ; break ;
   }
#else
      rec++ ;
      csfft_cox(mode,M,aa) ; csfft_cox(mode,M,bb) ; csfft_cox(mode,M,cc) ;
      csfft_cox(mode,M,dd) ; csfft_cox(mode,M,ee) ;
      rec-- ;
#endif

   /*-- recombination --*/

   if( mode > 0 ){ s72 =  SIN72 ; ss =  1 ; }
   else          { s72 = -SIN72 ; ss = -1 ; }

   c72 = COS72 ;
   c2  = c72 * c72 - s72 * s72 ;
   s2  = 2.0 * c72 * s72 ;

   for( k=0 ; k < M ; k++ ){
      ak  = bb[k].r ; bk  = bb[k].i      ;
      aj  = cs[k].r ; bj  = cs[k].i * ss ;
      bbr = aj*ak - bj*bk ; bbi = aj*bk + bj*ak  ;  /* b[k]*exp(ss*2*Pi*k/N) */

      ak  = cc[  k].r ; bk  = cc[  k].i      ;
      aj  = cs[2*k].r ; bj  = cs[2*k].i * ss ;
      ccr = aj*ak - bj*bk ; cci = aj*bk + bj*ak  ;  /* c[k]*exp(ss*4*Pi*k/N) */

      ak  = dd[  k].r ; bk  = dd[  k].i      ;
      aj  = cs[3*k].r ; bj  = cs[3*k].i * ss ;
      ddr = aj*ak - bj*bk ; ddi = aj*bk + bj*ak  ;  /* d[k]*exp(ss*6*Pi*k/N) */

      ak  = ee[  k].r ; bk  = ee[  k].i      ;
      aj  = cs[4*k].r ; bj  = cs[4*k].i * ss ;
      eer = aj*ak - bj*bk ; eei = aj*bk + bj*ak  ;  /* e[k]*exp(ss*8*Pi*k/N) */

      aar = aa[k].r ; aai = aa[k].i ;               /* a[k] */

      /* the code below is (heavily) adapted from fftn.c */

      akp = bbr + eer ; akm = bbr - eer ;
      bkp = bbi + eei ; bkm = bbi - eei ;
      ajp = ccr + ddr ; ajm = ccr - ddr ;
      bjp = cci + ddi ; bjm = cci - ddi ;

      xc[k].r = aar + akp + ajp ; xc[k].i = aai + bkp + bjp ;

      ak = akp * c72 + ajp * c2 + aar ; bk = bkp * c72 + bjp * c2 + aai ;
      aj = akm * s72 + ajm * s2       ; bj = bkm * s72 + bjm * s2       ;

      xc[k+M ].r = ak - bj ; xc[k+M ].i = bk + aj ;
      xc[k+M4].r = ak + bj ; xc[k+M4].i = bk - aj ;

      ak = akp * c2 + ajp * c72 + aar ; bk = bkp * c2 + bjp * c72 + aai ;
      aj = akm * s2 - ajm * s72       ; bj = bkm * s2 - bjm * s72       ;

      xc[k+M2].r = ak - bj ; xc[k+M2].i = bk + aj ;
      xc[k+M3].r = ak + bj ; xc[k+M3].i = bk - aj ;
   }

   return ;
}

/*--------------------------------------------------------------------
   Return the smallest integer >= idim for which we can do an FFT.
----------------------------------------------------------------------*/

#define N35 ((RMAX+1)*(RMAX+1))

int csfft_nextup( int idim )
{
   static int * tf = NULL , * dn = NULL ;
   int ibase , ii ;

   /*-- build table of allowable powers of 3 and 5 [tf],
        the powers of 2 just less than them        [dn],
        and their ratios tf/dn                     [rat].
        Then sort tf and dn to be increasing in rat.     --*/

   if( tf == NULL ){
      int p , q , tt,ff , i=0 , j ; float * rat ; float r ;

      tf  = (int *)   malloc(sizeof(int)  *N35) ;
      dn  = (int *)   malloc(sizeof(int)  *N35) ;
      rat = (float *) malloc(sizeof(float)*N35) ;

      /* create tables */

      for( p=0,tt=1 ; p <= RMAX ; p++,tt*=3 ){         /* tt = 3^p */
         for( q=0,ff=1 ; q <= RMAX ; q++,ff*=5,i++ ){  /* ff = 5^q */
            tf[i] = tt * ff ;

            j=2; while( j < tf[i] ){ j*=2; } /* j = power of 2 just >= tf */
            dn[i] = j/2 ;                    /* dn = power of 2 just < tf */

            rat[i] = (float)(tf[i]) / (float)(dn[i]) ;
         }
      }

      /* sort on rat (crude, but it works) */

      do{
         for( i=1,p=0 ; i < N35 ; i++ ){
            if( rat[i-1] > rat[i] ){
               r = rat[i-1] ; rat[i-1] = rat[i] ; rat[i] = r ;
               q = tf [i-1] ; tf [i-1] = tf [i] ; tf [i] = q ;
               q = dn [i-1] ; dn [i-1] = dn [i] ; dn [i] = q ;
               p++ ;
            }
         }
      } while( p > 0 ) ;

      free(rat) ;
   }

   /*-- loop on powers of 2 (ibase);
        we can do FFTs of size = tf*ibase/dn (note 1 < tf/dn < 2);
        sinc tf/dn is sorted, we're scanning in increasing sizes  --*/

   ibase = 1 ;
   while(1){
      if( idim <= ibase ) return ibase ;

      for( ii=0 ; ii < N35 ; ii++ )
         if( dn[ii] <= ibase && idim <= tf[ii]*ibase/dn[ii] )
            return tf[ii]*ibase/dn[ii] ;

      ibase *= 2 ;
   }
}

/*----------------------------------------------------------------------
   return the next legal value that has only 1 power of 3 and/or 5,
   and that is also even
------------------------------------------------------------------------*/

int csfft_nextup_one35( int idim )
{
   int jj = idim ;
   do{
      jj = csfft_nextup(jj) ;
      if( jj%9 == 0 || jj%25 == 0 || jj%2 == 1 ) jj++ ;
      else                                       return jj ;
   } while(1) ;
   return 0 ; /* cannot be reached */
}

/*----------------------------------------------------------------------
   return the next legal value that is even [13 May 2003 - RWCox]
------------------------------------------------------------------------*/

int csfft_nextup_even( int idim )
{
   int jj = idim ;
   do{
      jj = csfft_nextup(jj) ;
      if( jj%2 == 1 ) jj++ ;
      else            return jj ;
   } while(1) ;
   return 0 ; /* cannot be reached */
}