#!/usr/bin/env python

########################################################################
## 05/2018 Justin Rajendra
## wrapper for Gang's Bayesian Group Analysis

## system libraries
import sys, os, glob, subprocess, re, argparse, csv, signal, textwrap, shutil

## location and name of the R program that is run
R_script = sys.path[0]+'/BayesianGroupAna.R'

## check for valid integers
def int_gt_0(v):
    try:
        vi = int(v)
        if vi >= 1:
            return v
        else:
            raise argparse.ArgumentTypeError("%r is not an integer >= 1."%(v,))
    except:
        raise argparse.ArgumentTypeError("%r is not an integer >= 1."%(v,))

def zero2one(p):
    try:
        float(p)
    except:
        raise argparse.ArgumentTypeError("%r is not positive value < 1."%(p,))
    if float(p) < 0.0 or float(p) > 1.0:
        raise argparse.ArgumentTypeError("%r is not positive value < 1."%(p,))
    return p

## check for rectiness of mvm table
def data_is_rect(mdata):
    if mdata == None: return 1
    if len(mdata) == 0: return 1
    rlen = len(mdata[0])
    for row in mdata:
        if len(row) != rlen: return 0
    return 1

########################################################################
## parse command line arguments / build help

## make parser with help
parser = argparse.ArgumentParser(prog=str(sys.argv[0]),
                                 formatter_class=argparse.RawDescriptionHelpFormatter,
                                 description=textwrap.dedent('''\
------------------------------------------
Overview ~1~

    This program conducts Bayesian Group Analysis (BGA) on a list
    (e.g., 6 or more) of regions of interest (ROIs) as laid out in Chen et al.
    (2018, https://www.biorxiv.org/content/early/2018/02/20/238998).
    Compared to the conventional univariate GLM in which each voxel or ROI is
    considered autonomous and analyzed independently, BGA pools and shares the
    information across the ROIs in a multilevel system. It is the
    probability of incorrect sign, instead of false positive rate that is
    controlled. In other words, there is only one BGA model that incorporates
    the data from all ROIs.

    This will explore the effect of X on Y at each ROI. The computation may
    take a few minutes or more depending on the amount of input data and
    model complexity. The final inferences are conducted through the
    posterior distribution or quantile intervals for each effect that are
    provided in a table in the output. A boxplot can also be generated if
    requested with -plot.

    The computation requires that the R package "brms" be installed
    (e.g., through rPkgsInstall).
    More info on the brms package can be found here:
        https://CRAN.R-project.org/package=brms
    And the brms reference manual is here:
        https://cran.r-project.org/web/packages/brms/brms.pdf

Details ~1~

    Similar to 3dMVM and 3dLME, a data table  should be created containing
    the input data and relevant variables (with at least 3 columns: subject
    labels, ROI labels, response variable values).

    The -dataTable should be formatted as follows:

        Subj  ROI   some_y  some_x other_x
        S001  roi1  0.12    0.056  0.356
        S001  roi2  0.65    0.232  0.231
        S002  roi1  0.14    0.456  0.856
        S002  roi2  0.64    0.432  0.431
        ...

    The Subj and ROI columns must be included with the exact spelling!!
    If there are no x variables, only the intercept will be calculated.

Outputs ~1~

    Given -prefix is "gangBGA" and -x is "some_x", the default outputs are the
    following files:

        gangBGA_summary.txt:
            Summary of the brmsfit object from R.

        gangBGA_rhats.csv:
            rhats for each effect and x variable combination.

        gangBGA_Intercept_table.csv:
            Table with the MedianEst, StdDev, 2.50%, 5%, 50%, 95%, and 97.50%
            of each ROI for the Intercept term.

        gangBGA_some_x_table.csv:
            The same table as the Intercept but for the some_x variable.

Caveats ~1~

    All x variables are centered by default.

    The boxplot with -plot is not a standard boxplot.
    It is a plot of the 2.50%, 5%, 50%, 95%, 97.50% percentiles.
    The coloring of the boxes is determined by where the zero line crosses the
    box and whiskers.
        White:  The zero line crosses the main box (between 5% and 95%).
        Purple: The zero line crosses between the whiskers and the main box.
                (2.50% to 5%) OR (95% to 97.50%)
        Red:    The zero line does not cross the box or the whiskers.

    Additional plot types for -more_plots include (not sure all of these work):
        hist dens hist_by_chain dens_overlay violin intervalsareas
        acf acf_bar trace trace_highlight rhat rhat_hist neff neff_hist
        nuts_acceptance nuts_divergence nuts_stepsize nuts_treedepth
        nuts_energy

    Tables and plots will be created for the intercept and all specified x
    variables separately. So there may be a lot of output.

Examples ~1~

    Minimum requirement only calculates the intercept (may not be useful).
        BayesianGroupAna.py -dataTable my_roi_data.txt -y zscore

    More useful. Calculates 2 x variables and saves out some plots.
        BayesianGroupAna.py -dataTable my_roi_data.txt  \\
                            -prefix dock_of_the_bayes   \\
                            -y zscore -x some_x other_x \\
                            -chains 4 -iterations 1000  \\
                            -plot -more_plots rhat violin

------------------------------------------

Options ~1~

                                 '''),epilog=textwrap.dedent('''\
------------------------------------------
Justin Rajendra circa 05/2018
4 Gang Box...
Keep on keeping on!
------------------------------------------
                                 '''))

parser._optionals.title = 'Optional arguments'
required = parser.add_argument_group('Required arguments')

parser._action_groups.reverse()

## required
required.add_argument('-dataTable',type=str,help='Input text file.',
                      required=True)
required.add_argument('-y',type=str,help='Column name for the y variable.',
                      required=True,metavar='VAR')

## optional
parser.add_argument('-help',action='help',help='Show this help.')
parser.add_argument('-prefix',type=str,default="BayesOut",
                    help="Name of the output file.")
parser.add_argument('-x',type=str,default='1',nargs='+',metavar='VAR',
                      help=('Column name for the x variables. '+
                            'If not specified, only the intercept will be '+
                            'added.'))
parser.add_argument('-no_center',action='store_true',default=False,
                    help=("Disable centering on the x variables. "+
                          "Maybe useful if you centered manually."))
parser.add_argument('-iterations',type=int_gt_0,default=1000,metavar='ITER',
                    help=("Number of total iterations per chain including"+
                          " warmup. Default [1000]"))
parser.add_argument('-chains',type=int_gt_0,default=4,
                    help="Number of Markov chains. Default [4]")
parser.add_argument('-control_list',type=str,default="",metavar="LIST",
                    help=("Comma separated list of control parameters to"+
                          " pass to the brm function. (example:"+
                          " 'adapt_delta=0.99,max_treedepth=20')."+
                          " Default is the brm function defaults"))
parser.add_argument('-plot',action='store_true',default=False,
                    help="Output box, fit, and posterior prediction plots.")
parser.add_argument('-more_plots',type=str,default='',nargs='+',
                    metavar='TYPE',help=('Output "stanplots" given different'+
                                         ' types of plot names.'))
parser.add_argument('-RData',action='store_true',default=False,
                    help="Save the R session workspace and data.")
parser.add_argument('-seed',type=int_gt_0,default=1234,
                    help="Seed to generate random number. Default [1234]")
parser.add_argument('-overwrite',action='store_true',default=False,
                    help="Overwrites the output files.")

## if nothing, show help
if len(sys.argv) == 1:
    parser.print_help()
    sys.exit(1)

## do the parsing
args = parser.parse_args()

########################################################################
## collect the arguments (need strings) and validate
InFile = args.dataTable
y_var = args.y
x_var = args.x
OutFile = args.prefix
NumIter = str(args.iterations)
NumChains = str(args.chains)
control_list = args.control_list
InSeed = str(args.seed)
StanPlots = args.more_plots
overwrite = args.overwrite

## make caps for R code (check for plots)
plot = "TRUE" if args.plot or len(StanPlots) > 0 else "FALSE"
NoCenter = "TRUE" if args.no_center else "FALSE"
RData = "TRUE" if args.RData else "FALSE"

########################################################################
## validate

## check for output rhats
if os.path.isfile(OutFile+"_rhats.csv") and not overwrite:
    print("\nERROR: Output file ("+OutFile+"_rhats.csv) exists!!\n")
    sys.exit(1)

## check for input file
if not os.path.isfile(InFile):
    print("\nERROR: Input file ("+InFile+") not found!!\n")
    sys.exit(1)

## make sure the table is valid
test_table = []
with open(InFile) as fin:
    rows = (re.split('[ \t,;]+', line) for line in fin)  ## any delimiter?
    for row in rows:
        test_table.append(row)
if not data_is_rect(test_table):
    print("\nERROR: Input file ("+InFile+") is not a rectangular table!!\n")
    sys.exit(1)

## check plotting options
## standard plot choices
stanplot_types = ['hist','dens','hist_by_chain','dens_overlay','violin',
                  'intervals','areas','acf','acf_bar','trace',
                  'trace_highlight','rhat','rhat_hist','neff',
                  'neff_hist','nuts_acceptance','nuts_divergence',
                  'nuts_stepsize','nuts_treedepth',' nuts_energy']

## do not work yet: 'scatter',

## check the plotting types
if plot == "TRUE":
    print("\nPlotting: \nboxes\nfit\nPostPred")
    if set(StanPlots) < set(stanplot_types):
        for s in StanPlots: print(s)
        print("")
        StanPlots = " ".join(StanPlots)
    else:
        print("")
        for p in StanPlots:
            if p not in stanplot_types:
                print("ERROR: "+p+" is not a valid plot type!!!")
        print("")
        sys.exit(1)

########################################################################
## check the variable names

print("\nVariables in the table are:\n")
var_names = [item.strip() for item in test_table[0]]
for var in var_names: print(var)
print("")

if "Subj" not in var_names:
    print("\nERROR: Missing 'Subj' column!! Spelling counts!!\n")
    sys.exit(1)
if "ROI" not in var_names:
    print("\nERROR: Missing 'ROI' column!! Spelling counts!!\n")
    sys.exit(1)
if y_var not in var_names:
    print("\nERROR: "+y_var+" is not in the list of variables!! "+
          "Spelling counts!!\n")
    sys.exit(1)
if x_var != '1':
    for x in x_var:
        if x not in var_names:
            print("\nERROR: "+x+" is not in the list of variables!!"+
                  " Spelling counts!!\n")
            sys.exit(1)

## convert x_vars to space separated string
x_var = " ".join(x_var)

## add intercept if necessary
if x_var is not "1":
    x_var = "1 "+x_var

########################################################################
## run the R script with arguments

subprocess.call(['Rscript',R_script,InFile,y_var,x_var,OutFile,
                 NumIter,NumChains,control_list,InSeed,os.getcwd(),plot,
                 StanPlots,RData,NoCenter])

## check output
if os.path.isfile(OutFile+"_rhats.csv"):
    print("\nProcessing complete! Output is "+OutFile+"_rhats.csv\n")
    sys.exit(0)
else:
    print("\nERROR: Something failed in R!! There is no output file!\n")
    sys.exit(1)