#!/usr/bin/env python

# python3 status: compatible

# system libraries
import sys
import os
import glob

if 1:  # for testing, might add the current dir and ~/abin to the PATH
    try:
        sys.path.extend(['.', '%s/abin' % os.getenv('HOME')])
    except Exception:
        pass

# AFNI libraries
import afni_util as UTIL
import option_list as OL

# ----------------------------------------------------------------------
# globals

g_help_string = """
=============================================================================
gen_ss_review_table.py - generate a table from ss_review_basic output files

   Given many output text files (e.g. of the form out.ss_review.SUBJECT.txt),
   make a tab-delimited table of output fields, one infile/subject per line.

   The program is based on processing lines of the form:

        description label : value1 value2 ...

   A resulting table will have one row per input, and one column per value,
   with columns separated by a tab character, for input into a spreadsheet.

   The top row of the output will have labels.
   The second row will have value_N entries, corresponding to the labels.
   The first column will be either detected group names from the inputs,
      or will simply be the input file names.

 * See "gen_ss_review_scripts.py -help_fields" for short descriptions of
   the fields.

------------------------------------------
examples:

   1. typical usage: input all out.ss_review files across groups and subjects

      gen_ss_review_table.py -tablefile review_table.xls        \\
                -infiles group.*/subj.*/*.results/out.ss_review.*

   2. just show label table

      gen_ss_review_table.py -showlabs -infiles gr*/sub*/*.res*/out.ss_rev*

------------------------------------------
terminal options:

   -help                : show this help
   -hist                : show the revision history
   -ver                 : show the version number

------------------------------------------
process options:

   -infiles FILE1 ...   : specify @ss_review_basic output text files to process

         e.g. -infiles out.ss_review.subj12345.txt
         e.g. -infiles group.*/subj.*/*.results/out.ss_review.*

      The resulting table will be based on all of the fields in these files.

      This program can be used as a pipe for input and output, using '-'
      or file stream names.

   -overwrite           : overwrite the output -tablefile, if it exists

      Without this option, an existing -tablefile will not be overwritten.

   -separator SEP       : use SEP for the label/vals separator (default = ':')

         e.g. -separator :
         e.g. -separator tab
         e.g. -separator whitespace

      Use this option to specify the separation character or string between
      the labels and values.

   -showlabs            : display counts of all labels found, with parents

      This is mainly to help create a list of labels and parent labels.

   -show_missing        : display all missing keys

      Show all missing keys from all infiles.

   -tablefile OUT_NAME  : write final table to the given file

      If the specified file already exists, it will not be overwritten
      unless the -overwrite option is specified.

   -verb LEVEL          : be verbose (default LEVEL = 1)

------------------------------------------
Thanks to J Jarcho for encouragement and suggestions.

R Reynolds    April 2014
=============================================================================
"""

g_todo = """
   todo list:

   - when an unknown label is found, have user inform rick?
   - execute @ss_review_basic scripts for text output?
"""

g_history = """
   gen_ss_review_table.py history:

   0.0  Apr 07, 2014   - initial version: does the basic job
   0.1  Apr 08, 2014   - try to parse SID/GID from input file names
   0.2  Apr 09, 2014   - help update, separators, parents
   0.3  Jun 26, 2014   - label typos: track 'degress of freedom' as 'degrees'
   0.4  Aug 25, 2014   - defined oind
   0.5  May 19, 2014   - mention gen_ss_review_scripts.py -help_fields
   0.6  Aug 19, 2015   - added -show_missing, to display missing keys
   0.7  Oct 28, 2015   - make 'a/E mask Dice coef' parent of 'mask correlation'
   0.8  Aug 17, 2016   - 'blur estimates (FWHM)' is parent of 'blur estimates'
   1.0  Dec 28, 2017   - python3 compatible
"""

g_version = "gen_ss_review_table.py version 1.0, December 28, 2017"


class MyInterface:
    """Main interface class: parse options, read infiles, write the table.

    This uses lib_1D.py as an example.

    Typical call order (see main()): process_options(), parse_infiles(),
    write_table(), then optionally display_labels() / display_missing().
    Methods report errors by printing a '**' message and returning a
    non-zero status, rather than by raising.
    """

    def __init__(self, verb=1):
        """Initialize all state; 'verb' is the starting verbosity level."""
        # main variables
        self.valid_opts = None
        self.user_opts = None
        self.showlabs = 0            # flag - print labels at end
        self.show_missing = 0        # flag - print missing keys

        # control
        self.separator = ':'         # field separator (only first applies)
        self.seplen = 1              # length, to avoid recomputing
        self.overwrite = 0
        self.verb = verb             # -verb may still override this later

        # infile name parsing
        self.infiles = []
        self.snames = []
        self.gnames = []
        self.tablefile = ''

        # result variables
        self.labels = []             # list of input labels
        self.parents = []            # list of input label parents
        self.ldict = []              # corresponding list of infile dictionaries
        self.maxcounts = {}          # max count of elements per dict entry
        self.subjcounts = {}         # number of infiles having each label

        # initialize valid_opts
        self.valid_opts = self.get_valid_opts()

    def get_valid_opts(self):
        """Build and return the (sorted) OptionList of accepted options."""
        vopts = OL.OptionList('valid opts')

        # short, terminal arguments
        vopts.add_opt('-help', 0, [], helpstr='display program help')
        vopts.add_opt('-hist', 0, [],
                      helpstr='display the modification history')
        vopts.add_opt('-ver', 0, [],
                      helpstr='display the current version number')

        # general options
        vopts.add_opt('-infiles', -1, [],
                      helpstr='input text files (from @ss_review_basic)')
        vopts.add_opt('-overwrite', 0, [],
                      helpstr='allow overwrite for output table file')
        vopts.add_opt('-separator', 1, [],
                      helpstr="specify field separator (default=':')")
        vopts.add_opt('-showlabs', 0, [],
                      helpstr='show list of labels found')
        vopts.add_opt('-show_missing', 0, [],
                      helpstr='show all missing keys')
        vopts.add_opt('-tablefile', 1, [],
                      helpstr='file name for output table')
        vopts.add_opt('-verb', 1, [],
                      helpstr='set the verbose level (def=1)')

        vopts.sort()

        return vopts

    def process_options(self):
        """Apply the command-line options in sys.argv to this object.

        return  1 on valid and exit
        return  0 on valid and continue
        return -1 on invalid
        """
        argv = sys.argv

        # process any optlist_ options
        self.valid_opts.check_special_opts(argv)

        # process terminal options without the option_list interface
        # (so that errors are not reported)

        # if no arguments are given, do default processing
        if '-help' in argv or len(argv) < 2:
            print(g_help_string)
            return 1

        if '-hist' in argv:
            print(g_history)
            return 1

        if '-show_valid_opts' in argv:
            self.valid_opts.show('', 1)
            return 1

        if '-ver' in argv:
            print(g_version)
            return 1

        # ============================================================
        # read options specified by the user
        self.user_opts = OL.read_options(argv, self.valid_opts)
        uopts = self.user_opts            # convenience variable
        if not uopts:
            return -1                     # error condition

        # ------------------------------------------------------------
        # process verb first
        val, err = uopts.get_type_opt(int, '-verb')
        if val is not None and not err:
            self.verb = val

        # ------------------------------------------------------------
        # process options sequentially, to make them like a script
        errs = 0
        for opt in self.user_opts.olist:
            # check for anything to skip
            if opt.name == '-verb':
                pass

            elif opt.name == '-infiles':
                infiles, err = uopts.get_string_list('', opt=opt)
                if infiles is None or err:
                    print('** failed to read -infiles list')
                    errs += 1
                    continue
                # only parse names once a usable list is in hand
                self.infiles = infiles
                self.parse_infile_names()

            elif opt.name == '-overwrite':
                self.overwrite = 1

            elif opt.name == '-separator':
                sep, err = uopts.get_string_opt('', opt=opt)
                if sep is None or err:
                    print("** bad -separator option")
                    errs += 1
                    continue
                # translate the symbolic names into what the parser uses
                if sep == 'tab':
                    sep = '\t'
                elif sep == 'whitespace':
                    sep = 'ws'
                self.separator = sep
                self.seplen = len(sep)

            elif opt.name == '-showlabs':
                self.showlabs = 1

            elif opt.name == '-show_missing':
                self.show_missing = 1

            elif opt.name == '-tablefile':
                tablefile, err = uopts.get_string_opt('', opt=opt)
                if tablefile is None or err:
                    print("** bad -tablefile option")
                    errs += 1
                    continue
                self.tablefile = tablefile

            else:
                oind = self.user_opts.olist.index(opt)
                print('** unknown option # %d: %s' % (oind + 1, opt.name))
                errs += 1
                break

        # allow early and late error returns
        if errs:
            return -1

        # ------------------------------------------------------------
        # apply any trailing logic

        if len(self.infiles) < 1:
            print('** missing -infiles option')
            errs += 1

        if errs:
            return -1

        return 0

    def parse_infiles(self):
        """Make a list of field names and dictionaries
        (fill self.labels, self.ldict).

        for field name list, sorting is hard if fields are not consistent
           - for a new field (after file 0), try inserting after or before
             surrounding fields

        for each infile: make a dictionary of name:valuelist

        return 0 on success, 1 on error
        """
        errs = 0
        # check file existence first
        for ifile in self.infiles:
            if ifile in ['-', 'stdin']:
                continue
            if not os.path.isfile(ifile):
                print('** input file not found: %s' % ifile)
                errs += 1
        if errs:
            return 1

        # existence was checked separately, above
        for ifile in self.infiles:
            if self.verb > 2:
                print('++ processing %s ...' % ifile)

            # open, read, close (never close stdin, it is not ours)
            if ifile in ['-', 'stdin']:
                ilines = sys.stdin.readlines()
            else:
                try:
                    fp = open(ifile)
                except (IOError, OSError):
                    print("** failed to open input file %s" % ifile)
                    return 1
                try:
                    ilines = fp.readlines()
                finally:
                    fp.close()

            # empty should be a terminal failure
            if len(ilines) < 1:
                print('** empty input for file %s' % ifile)
                return 1

            # the first input with content defines the initial label list
            if len(self.labels) == 0:
                rv, self.labels = self.make_labels(ilines)
                self.parents = [self.find_parent_label(lab)
                                for lab in self.labels]
                if rv:
                    return 1

            rv, ldict = self.make_dict(ilines)
            if rv:
                return 1

            self.ldict.append(ldict)

        return 0

    def make_labels(self, ilines):
        """Parse a list of the form
              LABEL   : VALUES ...
        and return a LABEL list (with no trailing separator (':')).

        initialize maxcounts, subjcounts here

        return status (0), label list
        """
        llist = []
        for lstr in ilines:
            # get label and value list
            rv, label, vals = self.get_label_vals(lstr)
            if rv < 1:
                continue

            nvals = len(vals)

            # label = self.find_parent_label(label)

            if self.verb > 2:
                print('++ label: %s, %d val(s)' % (label, nvals))

            llist.append(label)
            self.maxcounts[label] = nvals
            self.subjcounts[label] = 0

        if not UTIL.vals_are_unique(llist):
            print('** warning: labels are not unique, '
                  'will use only last values')
            llist = UTIL.get_unique_sublist(llist)

        return 0, llist

    def find_parent_label(self, label):
        """Return the current name for an old/renamed label, else label."""
        # try to replace any old fields with new ones
        if label == 'maximum F-stat':
            return 'maximum F-stat (masked)'
        if label == 'num censored TRs per run':
            return 'num TRs per run (censored)'
        if label == 'num TRs per stim':
            return 'num TRs per stim (orig)'
        if label == 'degress of freedom left':   # typo (fixed 6/25/14)
            return 'degrees of freedom left'
        if label == 'degress of freedom used':   # typo (fixed 6/25/14)
            return 'degrees of freedom used'
        if label == 'anat/EPI mask correlation':
            return 'anat/EPI mask Dice coef'
        if label == 'blur estimates':
            return 'blur estimates (FWHM)'

        return label

    def make_dict(self, ilines):
        """Parse a list of the form
              LABEL   : VALUES ...
        and return a dictionary of dd[LABEL] = [values].

        monitor maxcounts
        accumulate subjcounts

        return status (0), dictionary
        """
        ldict = {}
        for lind, lstr in enumerate(ilines):
            # get label and value list
            rv, label, vals = self.get_label_vals(lstr)
            if rv < 1:
                continue

            nvals = len(vals)

            # label = self.find_parent_label(label)

            if self.verb > 3:
                print('++ dict[%s] = %s' % (label, vals))

            # if new label, try parent, else add
            if label not in self.labels:
                parent = self.find_parent_label(label)
                if parent in self.parents:
                    # a known label shares this parent: store under it
                    ll = self.labels[self.parents.index(parent)]
                    if self.verb > 3:
                        print('-- converting label %s to %s' % (label, ll))
                    label = ll
                else:
                    self.insert_new_label(label, lind, nvals)

            ldict[label] = vals
            self.update_max_counts(label, nvals)

        return 0, ldict

    def get_label_vals(self, line, getvals=1):
        """Parse a line into label and values.

        special case: if separator == ws (whitespace) split the whole line

        return status, label, vals ('getvals' is currently unused)

        status: -1 : error
                 0 : no separator found (or nothing on the line)
                 1 : success
        """
        # either split whole line (for whitespace)
        # or go after specific separator
        if self.separator == 'ws':
            fields = line.split()
            # a blank line has no label: treat like a missing separator
            if not fields:
                return 0, '', []
            label = fields[0]
            vals = fields[1:]
        else:
            cind = line.find(self.separator)
            if cind < 0:
                return 0, '', []

            label = line[0:cind].strip()
            vals = line[cind + self.seplen:].split()

        if self.verb > 4:
            print('-- GLV: label %s, vals %s' % (label, vals))

        return 1, label, vals

    def update_max_counts(self, label, nvals):
        """update maxcounts and subjcounts"""
        if label not in self.maxcounts:
            if self.verb > 1:
                print('** found new label key: %s' % label)
            self.maxcounts[label] = nvals
        elif nvals > self.maxcounts[label]:
            self.maxcounts[label] = nvals

        # tolerate a label that was never initialized in subjcounts
        self.subjcounts[label] = self.subjcounts.get(label, 0) + 1

    def insert_new_label(self, label, index, nvals):
        """Insert the new label into the labels list and init maxcounts.

        The label is appended at the end; 'index' is currently unused.
        """
        if label in self.labels:
            return
        self.labels.append(label)
        self.parents.append(self.find_parent_label(label))
        self.maxcounts[label] = nvals
        self.subjcounts[label] = 0

    def parse_infile_names(self):
        """Try to get subject and possibly group names from infiles.

        fill self.snames and self.gnames, if possible

        1. get SID
           - if files look like out.ss_review.SID.txt, that is a good start
           - else, look for varying part of filename
        2. get GID
           - replace SID in infile names and for varying group name
        """
        rv, slist = UTIL.list_minus_pref_suf(self.infiles,
                                             'out.ss_review.', '.txt')
        if rv < 0:
            return
        if rv > 0:
            if self.verb > 1:
                print('++ trying to get SID from glob form')
            slist = UTIL.list_minus_glob_form(self.infiles, strip='dir')
        else:
            if self.verb > 1:
                print("++ have SIDs from 'out.ss_review' form")

        if len(slist) == 0:
            if self.verb > 1:
                print("-- empty SID list")
            return

        # make sure names are unique and not empty
        if not UTIL.vals_are_unique(slist):
            if self.verb > 1:
                print('-- SIDs not detected: not unique')
            return
        minlen = min([len(ss) for ss in slist])
        if minlen < 1:
            if self.verb > 1:
                print('-- SIDs not detected: some would be empty')
            return

        # we have a subject list
        self.snames = slist

        # now go for GID, start by replacing SIDs in infiles
        newfiles = [fname.replace(sid, 'SUBJ')
                    for fname, sid in zip(self.infiles, slist)]

        if UTIL.vals_are_constant(newfiles):
            print('-- no groups detected from filenames')
            return

        # okay, try to make a group list
        glist = UTIL.list_minus_glob_form(newfiles)

        # an empty list would break min() below
        if len(glist) == 0:
            if self.verb > 1:
                print('-- GIDs not detected: empty list')
            return

        # cannot have dirs in result
        for gid in glist:
            if gid.find('/') >= 0:
                if self.verb > 1:
                    print('-- no GIDs, dirs vary in multiple places')
                return

        minlen = min([len(ss) for ss in glist])
        if minlen < 1:
            if self.verb > 1:
                print('-- GIDs not detected: some would be empty')
            return

        if self.verb > 1:
            print("++ have GIDs from infiles")
        self.gnames = glist

    def display_labels(self):
        """display the final labels list"""
        nsubj = len(self.infiles)

        print('-- final label table (length %d):' % len(self.labels))
        for label in self.labels:
            nv = self.maxcounts[label]
            if nv == 1:
                cstr = '%3d val' % nv
            else:
                cstr = '%3d vals' % nv

            nv = self.subjcounts[label]
            if nv == 1:
                sstr = '%3d file' % nv
            else:
                sstr = '%3d files' % nv

            # flag labels that do not appear in every input file
            if nv < nsubj:
                short = '  (short)'
            else:
                short = ''
            print('%-30s : %-10s : %-10s%s' % (label, cstr, sstr, short))

    def write_table(self):
        """Write the header and value lines to self.tablefile.

        '-' or 'stdout' writes to sys.stdout.  An existing file is only
        replaced if self.overwrite is set.

        return 0 on success (or if there is nothing to write), 1 on error
        """
        if not self.tablefile:
            if self.verb:
                print('-- no tablefile to write')
            return 0

        if len(self.labels) < 1:
            print('** no labels for output table')
            return 1

        if len(self.ldict) < 1:
            print('** no label dictionaries')
            return 1

        # open output file
        if self.tablefile in ['-', 'stdout']:
            fp = sys.stdout
        elif os.path.exists(self.tablefile) and not self.overwrite:
            print('** output table file %s exists, and no overwrite given'
                  % self.tablefile)
            return 1
        else:
            try:
                fp = open(self.tablefile, 'w')
            except (IOError, OSError):
                print("** failed to open table '%s' for writing"
                      % self.tablefile)
                return 1

        # close our own file even if writing fails part way
        try:
            if self.write_header_lines(fp):
                return 1
            if self.write_value_lines(fp):
                return 1
        finally:
            if fp is not sys.stdout:
                fp.close()

        return 0

    def write_header_lines(self, fp):
        """Write 2 header lines:
           - the field labels
           - the list of corresponding values

        start with either group name (if they exist) or infile name
        next is subject name, if they exist

        Each field label should take as many columns as its values.

        return 0 on success, 1 if there are no labels
        """
        if len(self.labels) < 1:
            return 1

        # labels, starting with input files

        # start with group or infile string, along with subject, if possible
        if len(self.gnames) == len(self.infiles):
            fp.write('group')
        else:
            fp.write('infile')
        # if len(self.snames) == len(self.infiles): fp.write('\tsubject')

        for label in self.labels:
            # pad with empty cells so the label spans all of its values
            nf = self.maxcounts[label] - 1
            fp.write('\t%s' % label)
            fp.write('\t' * nf)
        fp.write('\n')

        # next line: group and subject, if possible
        fp.write('value')   # this is for group/infile
        # if len(self.snames) == len(self.infiles): fp.write('\tvalue')

        for label in self.labels:
            nf = self.maxcounts[label]
            for ind in range(nf):
                fp.write('\tvalue_%d' % (ind + 1))
        fp.write('\n')

        return 0

    def write_value_lines(self, fp):
        """Write value lines, "left justified" to maxcount fields.

        for each infile
           for each label
              if dict[label]: print values
              print any needed tabs

        return 0 on success, 1 if there are no labels
        """
        if len(self.labels) < 1:
            return 1

        # labels, starting with input files

        # start with subject, if possible
        dosubj = len(self.snames) == len(self.infiles)
        dogrp = len(self.gnames) == len(self.infiles)

        for ind, infile in enumerate(self.infiles):
            # first is group or infile
            if dogrp:
                fp.write('%s' % self.gnames[ind])
            else:
                # infile instead of group
                if infile == '-':
                    fp.write('stdin')
                else:
                    fp.write('%s' % infile)

            # subject, if possible (repeat?)
            # if dosubj: fp.write('\t%s' % self.snames[ind])

            for label in self.labels:
                nf = self.maxcounts[label]
                try:
                    vals = self.ldict[ind][label]
                except (KeyError, IndexError):
                    if self.verb > 2:
                        print('** infile %s missing key %s'
                              % (infile, label))
                    vals = []
                nv = len(vals)
                if nv > 0:
                    fp.write('\t' + '\t'.join(vals))
                # pad short (or missing) entries out to the column width
                if nf > nv:
                    fp.write('\t' * (nf - nv))
            fp.write('\n')

        return 0

    def display_missing(self):
        """Show files where keys are missing.

        return 0 on success, 1 if there are no labels
        """
        if len(self.labels) < 1:
            return 1

        # first generate list of missing labels per input file
        # (plus allmissing: a list of all missing labels)
        allmissing = []
        mlist = []     # list of file, lablist of missing labels
        for ind, infile in enumerate(self.infiles):
            missing = []
            for label in self.labels:
                if label not in self.ldict[ind]:
                    missing.append(label)
                    if label not in allmissing:
                        allmissing.append(label)
            if len(missing) > 0:
                mlist.append([infile, missing])

        if len(mlist) == 0:
            return 0

        # note longest infile name length
        maxflen = max([len(mm[0]) for mm in mlist])

        # --- set oneline, based on max missing labels and max label len ---

        # note maximum number of missing labels (over files)
        maxmissing = max([len(mm[1]) for mm in mlist])

        # note longest (missing) label
        maxllen = max([len(lab) for lab in allmissing])

        oneline = maxmissing < 2 or maxllen <= 10

        # show results, on one or multiple lines, each
        for infile, missing in mlist:
            if oneline:
                print('missing keys in %-*s : %s'
                      % (maxflen, infile, ', '.join(missing)))
            else:
                for lab in missing:
                    print('missing key in %-*s : %s'
                          % (maxflen, infile, lab))

        return 0


def main():
    """Run the program; return the process exit status (0 on success)."""
    me = MyInterface()
    if not me:
        return 1

    rv = me.process_options()
    if rv > 0:
        return 0    # valid and exit
    if rv < 0:      # error and exit
        print('** failed to process options...')
        return 1

    if me.parse_infiles():
        return 1

    if me.write_table():
        return 1

    if me.showlabs:
        me.display_labels()

    if me.show_missing:
        me.display_missing()

    return 0


if __name__ == '__main__':
    sys.exit(main())