#include "mrilib.h"

/*** This file is intended to be #include-d into another source file ***/

#include <time.h>
#include <sys/types.h>
#include <unistd.h>

#ifdef USE_OMP
# include <omp.h>
#endif

/*** Thread-safe FFT functions in csfft_OMP.c are used ***/

#include "csfft_OMP.c"
#define CFFT       csfft_cox_OMP
#define CFFT_setup csfft_cox_OMP_SETUP()

/*----------------------------------------------------------------------------*/
/* What it says. The dimensions are assumed to be compatible with csfft_cox().
   The array tar, if not NULL, must be at least as long as MAX(ny,nz).
*//*--------------------------------------------------------------------------*/

static void mri_fft3D_inplace( MRI_IMAGE *fim , complex *tar )
{
   complex *far , *qar ;
   int nx,ny,nz,nxy, ii,jj,kk , nn,qq ;

   if( fim == NULL || fim->kind != MRI_complex ) return ;
   far = MRI_COMPLEX_PTR(fim) ;
   if( far == NULL )                             return ;

   nx = fim->nx ; ny = fim->ny ; nz = fim->nz ; nxy = nx*ny ;

   /* x FFTs */

   CFFT_setup ;  /* for the OMP version of csfft,
                    which allows csfft_cox_OMP to be called inside a thread */

   for( qar=far,kk=0 ; kk < nz ; kk++ ){
    for( jj=0 ; jj < ny ; jj++ , qar+=nx ){
      CFFT( -1 , nx , qar ) ;
   }}

   /* y FFTs */

   if( tar == NULL ){  /* create temp space */
     qar = (complex *)malloc(sizeof(complex)*MAX(ny,nz)) ;
   } else {
     qar = tar ;       /* user-supplied temp space */
   }

   for( kk=0 ; kk < nz ; kk++ ){
    for( ii=0 ; ii < nx ; ii++ ){
      qq = ii+kk*nxy ;
      for( jj=0 ; jj < ny ; jj++ ) qar[jj] = far[qq+jj*nx] ;
      CFFT( -1 , ny , qar ) ;
      for( jj=0 ; jj < ny ; jj++ ) far[qq+jj*nx] = qar[jj] ;
   }}

   /* z FFTs */

   for( jj=0 ; jj < ny ; jj++ ){
    for( ii=0 ; ii < nx ; ii++ ){
      qq = ii+jj*nx ;
      for( kk=0 ; kk < nz ; kk++ ) qar[kk] = far[qq+kk*nxy] ;
      CFFT( -1 , nz , qar ) ;
      for( kk=0 ; kk < nz ; kk++ ) far[qq+kk*nxy] = qar[kk] ;
   }}

   if( tar == NULL ) free(qar) ;  /* did we create this? */
   return ;
}

/*----------------------------------------------------------------------------*/
/* Given a field size and a kernel function radius,
   determine the grid size for the FFTs to perform, including
   some buffering for edge effects of the kernel function.
   Output grid sizes are even and compatible with csfft_cox().
*//*--------------------------------------------------------------------------*/

static int_triple get_convo_field_size( int   nx, int   ny, int   nz,
                                        float dx, float dy, float dz, float rr )
{
   int_triple ijk = { 0,0,0 } ;
   int qq , xx,yy,zz ;

   if( rr <= 0.0f ) return ijk ;

   xx = nx + 2*(int)ceilf(rr/dx) ;  /* expand for this buffer */
   yy = ny + 2*(int)ceilf(rr/dy) ;
   zz = nz + 2*(int)ceilf(rr/dz) ;
   if( xx < 16 ) xx = 16 ;
   if( yy < 16 ) yy = 16 ;
   if( zz < 16 ) zz = 16 ;
   xx = csfft_nextup_one35(xx) ;    /* expand for FFT allowable sizes */
   yy = csfft_nextup_one35(yy) ;    /* (even with at most one) */
   zz = csfft_nextup_one35(zz) ;    /* (factor of 3 and/or 5,) */
                                    /* (for efficient FFT-ing) */

   ijk.i = xx ; ijk.j = yy ; ijk.k = zz ; return ijk ;
}

/*----------------------------------------------------------------------------*/
/* Create the kernel smoothing function, then FFT it, and clip/shrink it
   as far as reasonable.  The basic method for convolving an input 3D
   field is then
     (1) copy it to a larger buffered volume, then FFT it
     (2) multiply this field by the radial weight function from here
     (3) FFT back to real space
     (4) truncate back to desired 3D grid
   Note that nx,ny,nz must be compatible with csfft_cox(), which
   will be easiest if get_convo_field_size() is used to set them.
*//*--------------------------------------------------------------------------*/

/* macros for 3D array access */

#undef  WAR
#define WAR(i,j,k) war[(i)+(j)*nx+(k)*nxy]
#undef  QAR
#define QAR(i,j,k) qar[(i)+(j)*nxh+(k)*nxyh]
#undef  FAR
#define FAR(i,j,k) far[(i)+(j)*itop+(k)*ijtop]

static MRI_IMAGE * make_radial_convo_weight( int   nx , int   ny , int   nz ,
                                             float dx , float dy , float dz ,
                                             float (*kerfun)(float) , float rr )
{
   MRI_IMAGE *wim ,       *fim , *qim ;
   complex   *war ; float *far , *qar ;
   int nxy=nx*ny , ii,jj,kk , nxh=nx/2,nyh=ny/2,nzh=nz/2 , nxyh=nxh*nyh ;
   int itop,jtop,ktop,ijtop ;
   float xx,yy,zz , ftop ;

   if( kerfun == NULL ) return NULL ;

   /* if radius not provided, make it up */

   if( rr <= 0.0f ){
     float *kar ;
     itop = MAX(nx,ny) ; itop = MAX(itop,nz) ;
     xx   = MAX(dx,dy) ; xx   = MAX(xx,dz) ;
     kar  = (float *)malloc(sizeof(float)*itop) ; ftop = 0.0f ;
     for( ii=0 ; ii < itop ; ii++ ){
       kar[ii] = fabsf(kerfun(ii*xx)) ; if( ftop < kar[ii] ) ftop = kar[ii] ;
     }
     if( ftop == 0.0f ){ free(kar) ; return NULL ; }
     ftop *= 0.0001f ;
     for( ii=itop-1 ; ii > 0 && kar[ii] < ftop ; ii-- ) ; /* nada */
     free(kar) ;
     rr = (ii+1)*xx ;
   }

   /* setup kernel in real space */

   wim = mri_new_vol( nx,ny,nz , MRI_complex ) ;
   war = MRI_COMPLEX_PTR(wim) ;

   /* fill the array with the kernel in real space */

   for( kk=0 ; kk < nz ; kk++ ){
     zz = (kk < nzh) ? kk*dz : (nz-kk)*dz ;
     for( jj=0 ; jj < ny ; jj++ ){
       yy = (jj < nyh) ? jj*dy : (ny-jj)*dy ;
       for( ii=0 ; ii < nx ; ii++ ){
         if( ii==nxh || jj==nyh || kk==nzh ){  /* Nyquist */
           WAR(ii,jj,kk) = CMPLX(0.0f,0.0f) ;
         } else {
           xx = (ii < nxh) ? ii*dx : (nx-ii)*dx ;
           rr = kerfun( sqrtf(xx*xx+yy*yy+zz*zz) ) ;
           WAR(ii,jj,kk) = CMPLX(rr,0.0f) ;
         }
       }
     }
   }

   /* go to FFT space */

   mri_fft3D_inplace( wim , NULL ) ;

   /* create clipped output */

   qim = mri_new_vol( nxh,nyh,nzh , MRI_float ) ;
   qar = MRI_FLOAT_PTR(qim) ;

   ftop = 0.00001f * fabsf(war[0].r) ;  /* largest value to keep */

   /* half size in each direction since is symmetric */
   /* note we compute sqrt(FFT(wim)) since we are creating the
      weight for the noise
      -- which is effectively squared when the ACF is computed/estimated */

   for( kk=0 ; kk < nzh ; kk++ ){
    for( jj=0 ; jj < nyh ; jj++ ){
     for( ii=0 ; ii < nxh ; ii++ ){
       rr = WAR(ii,jj,kk).r ;       /* imag part should be very small */
       if( rr < ftop ) rr = 0.0f ; else rr = sqrtf(rr) ; /* clip here */
       QAR(ii,jj,kk) = rr ;                            /* load output */
   }}}

   mri_free(wim) ;  /* done with this */

   /* shrink it in space, if possible */

   MRI_autobbox( qim , NULL,&itop , NULL,&jtop , NULL,&ktop ) ;

   if( itop < nxh-1 ) itop++ ;
   if( jtop < nyh-1 ) jtop++ ;
   if( ktop < nzh-1 ) ktop++ ;
   ijtop = itop*jtop ;
   fim = mri_new_vol( itop , jtop , ktop , MRI_float ) ;
   far = MRI_FLOAT_PTR(fim) ;
   ININFO_message("Kernel image dimensions %d x %d x %d",itop,jtop,ktop) ;

   for( kk=0 ; kk < ktop ; kk++ ){
    for( jj=0 ; jj < jtop ; jj++ ){
     for( ii=0 ; ii < itop ; ii++ ){
       FAR(ii,jj,kk) = QAR(ii,jj,kk) ;
   }}}

   mri_free(qim) ; return fim ;
}

#undef WAR
#undef QAR
#undef FAR

/*----------------------------------------------------------------------------*/
/* Convolve with a pre-computed kernel via FFT.
     nx,ny,nz = dimensions of output
     wtim     = weight image in FFT space
     tar      = FFT workspace - at least MAX(ny,nz) long (or NULL)
*//*--------------------------------------------------------------------------*/

/* more macros for 3D array access */

#undef  CXAR
#define CXAR(i,j,k) cxar[(i)+(j)*nx+(k)*nxy]

#undef  WTAR
#define WTAR(i,j,k) wtar[(i)+(j)*nxw+(k)*nxyw]

#undef  CXRAN
#define CXRAN(i,j,k) \
 ( zr=ww*zgaussian_sss(xran), zi=ww*zgaussian_sss(xran), CXAR(i,j,k)=CMPLX(zr,zi) )

static void mri_radial_convo( MRI_IMAGE *fim , MRI_IMAGE *wtim , complex *tar )
{
   MRI_IMAGE *cxim ; complex *cxar ;
   MRI_IMARR *outar ;
   float     *wtar ;
   int nxy , nxh,nyh,nzh , nxw,nyw,nzw,nxyw , ii,jj,kk ;
   float ww , zr, zi ;

   cxim = mri_new_vol( nx,ny,nz , MRI_complex ) ;
   cxar = MRI_COMPLEX_PTR(cxim) ;
   wtar = MRI_FLOAT_PTR(wtim) ;
   nxw  = wtim->nx ; nyw = wtim->ny ; nzw = wtim->nz ; nxyw = nxw*nyw ;
   nxy  = nx*ny ;

   /* fill random values */

   for( kk=1 ; kk < nzw ; kk++ ){     /* interior grid */
    for( jj=1 ; jj < nyw ; jj++ ){
     for( ii=1 ; ii < nxw ; ii++ ){
       ww = WTAR(ii,jj,kk) ;
       if( ww != 0.0f ){  /* fill all 8 reflections */
         CXRAN(ii,   jj,   kk) ; CXRAN(nx-ii,   jj,   kk) ;
         CXRAN(ii,ny-jj,   kk) ; CXRAN(nx-ii,ny-jj,   kk) ;
         CXRAN(ii,   jj,nz-kk) ; CXRAN(nx-ii,   jj,nz-kk) ;
         CXRAN(ii,ny-jj,nz-kk) ; CXRAN(nx-ii,ny-jj,nz-kk) ;
       }
   }}}

   for( kk=1 ; kk < nzw ; kk++ ){   /* along ii=0 face */
    for( jj=1 ; jj < nyw ; jj++ ){
      ww = WTAR(0,jj,kk) ;
      if( ww != 0.0f ){   /* just 4 reflections */
        CXRAN(0,jj,   kk) ; CXRAN(0,ny-jj,   kk) ;
        CXRAN(0,jj,nz-kk) ; CXRAN(0,ny-jj,nz-kk) ;
      }
   }}

   for( jj=1 ; jj < nyw ; jj++ ){   /* along kk=0 face */
    for( ii=1 ; ii < nxw ; ii++ ){
      ww = WTAR(ii,jj,0) ;
      if( ww != 0.0f ){   /* just 4 reflections */
        CXRAN(ii,   jj,0) ; CXRAN(nx-ii,   jj,0) ;
        CXRAN(ii,ny-jj,0) ; CXRAN(nx-ii,ny-jj,0) ;
      }
   }}

   for( kk=1 ; kk < nzw ; kk++ ){   /* along jj=0 face */
    for( ii=1 ; ii < nxw ; ii++ ){
      ww = WTAR(ii,0,kk) ;
      if( ww != 0.0f ){   /* just 4 reflections */
        CXRAN(ii,0,   kk) ; CXRAN(nx-ii,0,   kk) ;
        CXRAN(ii,0,nz-kk) ; CXRAN(nx-ii,0,nz-kk) ;
      }
   }}

   for( kk=1 ; kk < nzw ; kk++ ){  /* along ii=jj=0 edge */
     ww = WTAR(0,0,kk) ;
     if( ww != 0.0f ){  /* just 2 reflections */
       CXRAN(0,0,kk) ; CXRAN(0,0,nz-kk) ;
     }
   }

   for( jj=1 ; jj < nyw ; jj++ ){  /* along ii=kk=0 edge */
     ww = WTAR(0,jj,0) ;
     if( ww != 0.0f ){  /* just 2 reflections */
       CXRAN(0,jj,0) ; CXRAN(0,ny-jj,0) ;
     }
   }

   for( ii=1 ; ii < nxw ; ii++ ){  /* along jj=kk=0 edge */
     ww = WTAR(ii,0,0) ;
     if( ww != 0.0f ){  /* just 2 reflections */
       CXRAN(ii,0,0) ; CXRAN(nx-ii,0,0) ;
     }
   }

   CXAR(0,0,0) = CMPLX(0.0f,0.0f) ;   /* the origin ii=jj=kk=0 */
                                      /* set to zero to get zero mean output */
   /* FFT to real space */

   mri_fft3D_inplace( cxim , tar ) ;

   /* create output and exit */

   outar = mri_complex_to_pair( cxim ) ;
   mri_free(cxim) ;

   return outar ;
}

#if 0
/*----------------------------------------------------------------------------*/
/* Main program for testing purposes */

int main( int argc , char *argv[] )
{
   float rr , vv , parm[3] ;
   int_triple ijk ;
   float dxyz=2.0f ; int nxyz=128 ; double ct,wt ;
   MRI_IMAGE *wim ;
   THD_3dim_dataset *dset=NULL ;
   THD_ivec3 nxyz_vec ; THD_fvec3 fxyz_vec , oxyz_vec ;
   unsigned short xran[3] ; unsigned int gseed ; int ithr=1,nbrik=2 ;
   int nthr=1 , ith ;

   parm[0] = 0.66f ;
   parm[1] = 6.62f ;
   parm[2] = 11.09f ;

   if( argc > 1 ){
     nbrik = (int)strtod(argv[1],NULL) ;
     if( nbrik < 2 ) nbrik = 2 ;
   }
   if( nbrik%2 == 1 ) nbrik++ ;

   ijk = get_random_field_size( nxyz , nxyz , nxyz ,
                                dxyz , dxyz , dxyz , parm ) ;

   nxyz = ijk.i ;
   wim = make_radial_weight( nxyz,nxyz,nxyz , dxyz,dxyz,dxyz , parm ) ;

   gseed = ((unsigned int)time(NULL)) + 17*(unsigned int)getpid() ;
   xran[2] = ( gseed        & 0xffff) + (unsigned short)ithr ;
   xran[1] = ((gseed >> 16) & 0xffff) - (unsigned short)ithr ;
   xran[0] = 0x330e                   + (unsigned short)ithr ;

   INFO_message("creating random dataset") ;

   dset = EDIT_empty_copy(NULL) ;
   nxyz_vec.ijk[0] = nxyz ; nxyz_vec.ijk[1] = nxyz ; nxyz_vec.ijk[2] = nxyz ;
   fxyz_vec.xyz[0] = dxyz ; fxyz_vec.xyz[1] = dxyz ; fxyz_vec.xyz[2] = dxyz ;
   oxyz_vec.xyz[0] = 0.0f ; oxyz_vec.xyz[1] = 0.0f ; oxyz_vec.xyz[2] = 0.0f ;

   EDIT_dset_items( dset ,
                      ADN_datum_all  , MRI_float           ,
                      ADN_nxyz       , nxyz_vec            ,
                      ADN_xyzdel     , fxyz_vec            ,
                      ADN_xyzorg     , oxyz_vec            ,
                      ADN_prefix     , "./randomFFT"       ,
                      ADN_nvals      , nbrik               ,
                      ADN_malloc_type, DATABLOCK_MEM_MALLOC,
                      ADN_type       , HEAD_ANAT_TYPE      ,
                      ADN_view_type  , VIEW_ORIGINAL_TYPE  ,
                      ADN_func_type  , ANAT_EPI_TYPE       ,
                    ADN_none ) ;

#ifdef USE_OMP
#pragma omp parallel
 { if( omp_get_thread_num() == 0 ){
     nthr = omp_get_num_threads() ;
     ININFO_message("number of threads = %d",nthr) ;
 }}
#endif

   ct = COX_cpu_time() ; wt = COX_clock_time() ;

AFNI_OMP_START ;
#pragma omp parallel
 { int ib ; MRI_IMARR *abar; MRI_IMAGE *aim,*bim; complex *tar;
   tar = (complex *)malloc(sizeof(complex)*(nxyz+nxyz)) ;
#pragma omp for
   for( ib=0 ; ib < nbrik ; ib+=2 ){
     abar = make_radial_random_field( nxyz,nxyz,nxyz , wim , tar , xran ) ;
     aim = IMARR_SUBIM(abar,0) ; bim = IMARR_SUBIM(abar,1) ;
#pragma omp critical
     { EDIT_substitute_brick( dset , ib   , MRI_float , MRI_FLOAT_PTR(aim) ) ;
       EDIT_substitute_brick( dset , ib+1 , MRI_float , MRI_FLOAT_PTR(bim) ) ;
     }
     FREE_IMARR(abar) ; abar = NULL ;
   }
   free(tar) ;
 }
AFNI_OMP_END ;

   ct = COX_cpu_time()-ct ; wt = COX_clock_time()-wt ;
   ININFO_message("CPU time = %.1f  Clock time = %.1f  Ratio = %.1f",ct,wt,ct/wt) ;

   ININFO_message("writing dataset") ;
   DSET_write(dset) ; WROTE_DSET(dset) ; DSET_delete(dset) ; dset = NULL ;

   exit(0) ;
}
#endif