/*-------------------------------------------------------------------------*/ /* This file contains code to calculate Kendall's Tau in O(N log N) time in * a manner similar to the following reference: * * A Computer Method for Calculating Kendall's Tau with Ungrouped Data * William R. Knight Journal of the American Statistical Association, * Vol. 61, No. 314, Part 1 (Jun., 1966), pp. 436-439 * * Copyright 2010 David Simcha * * License: * Boost Software License - Version 1.0 - August 17th, 2003 * * Permission is hereby granted, free of charge, to any person or organization * obtaining a copy of the software and accompanying documentation covered by * this license (the "Software") to use, reproduce, display, distribute, * execute, and transmit the Software, and to prepare derivative works of the * Software, and to permit third-parties to whom the Software is furnished to * do so, all subject to the following: * * The copyright notices in the Software and this entire statement, including * the above license grant, this restriction and the following disclaimer, * must be included in all copies of the Software, in whole or in part, and * all derivative works of the Software, unless such copies or derivative * works are solely in the form of machine-executable object code generated by * a source language processor. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * */ /*-------------------------------------------------------------------------*/ /**---------------- Adapted for AFNI by RWCox - April 2010 ---------------**/ /*-------------------------------------------------------------------------*/ #include #include #include #include #include #include #ifdef SOLARIS_OLD /* 'if' might be overkill, but just to be minimal... */ /* 10 May 2010 [rickr] */ #include "machdep.h" #else #include #endif /*-------------------------------------------------------------------------*/ /* Sorts in place, returns the bubble sort distance between the input array * and the sorted array. */ static int insertionSort(float *arr, int len) { int maxJ, i,j , swapCount = 0; /* printf("enter insertionSort len=%d\n",len) ; */ if(len < 2) { return 0; } maxJ = len - 1; for(i = len - 2; i >= 0; --i) { float val = arr[i]; for(j=i; j < maxJ && arr[j + 1] < val; ++j) { arr[j] = arr[j + 1]; } arr[j] = val; swapCount += (j - i); } return swapCount; } /*-------------------------------------------------------------------------*/ static int merge(float *from, float *to, int middle, int len) { int bufIndex, leftLen, rightLen , swaps ; float *left , *right; /* printf("enter merge\n") ; */ bufIndex = 0; swaps = 0; left = from; right = from + middle; rightLen = len - middle; leftLen = middle; while(leftLen && rightLen) { if(right[0] < left[0]) { to[bufIndex] = right[0]; swaps += leftLen; rightLen--; right++; } else { to[bufIndex] = left[0]; leftLen--; left++; } bufIndex++; } if(leftLen) { #pragma omp critical (MEMCPY) memcpy(to + bufIndex, left, leftLen * sizeof(float)); } else if(rightLen) { #pragma omp critical (MEMCPY) memcpy(to + bufIndex, right, rightLen * sizeof(float)); } return swaps; } /*-------------------------------------------------------------------------*/ /* Sorts in place, returns the bubble sort distance between the input array * and the sorted array. */ static int mergeSort(float *x, float *buf, int len) { int swaps , half ; /* printf("enter mergeSort\n") ; */ if(len < 10) { return insertionSort(x, len); } swaps = 0; if(len < 2) { return 0; } half = len / 2; swaps += mergeSort(x, buf, half); swaps += mergeSort(x + half, buf + half, len - half); swaps += merge(x, buf, half, len); #pragma omp critical (MEMCPY) memcpy(x, buf, len * sizeof(float)); return swaps; } /*-------------------------------------------------------------------------*/ static int getMs(float *data, int len) /* Assumes data is sorted */ { int Ms = 0, tieCount = 0 , i ; /* printf("enter getMs\n") ; */ for(i = 1; i < len; i++) { if(data[i] == data[i-1]) { tieCount++; } else if(tieCount) { Ms += (tieCount * (tieCount + 1)) / 2; tieCount = 0; } } if(tieCount) { Ms += (tieCount * (tieCount + 1)) / 2; } return Ms; } /*-------------------------------------------------------------------------*/ /* This function calculates the Kendall correlation tau_b. * The arrays arr1 should be sorted before this call, and arr2 should be * re-ordered in lockstep. This can be done by calling * qsort_floatfloat(len,arr1,arr2) * for example. * Note also that arr1 and arr2 will be modified, so if they need to * be preserved, do so before calling this function. */ float kendallNlogN( float *arr1, float *arr2, int len ) { int m1 = 0, m2 = 0, tieCount, swapCount, nPair, s,i ; float cor ; /* printf("enter kendallNlogN\n") ; */ if( len < 2 ) return (float)0 ; nPair = len * (len - 1) / 2; s = nPair; tieCount = 0; for(i = 1; i < len; i++) { if(arr1[i - 1] == arr1[i]) { tieCount++; } else if(tieCount > 0) { insertionSort(arr2 + i - tieCount - 1, tieCount + 1); m1 += tieCount * (tieCount + 1) / 2; s += getMs(arr2 + i - tieCount - 1, tieCount + 1); tieCount = 0; } } if(tieCount > 0) { insertionSort(arr2 + i - tieCount - 1, tieCount + 1); m1 += tieCount * (tieCount + 1) / 2; s += getMs(arr2 + i - tieCount - 1, tieCount + 1); } swapCount = mergeSort(arr2, arr1, len); m2 = getMs(arr2, len); s -= (m1 + m2) + 2 * swapCount; if( m1 < nPair && m2 < nPair ) cor = s / ( sqrtf((float)(nPair-m1)) * sqrtf((float)(nPair-m2)) ) ; else cor = 0.0f ; return cor ; } /*-------------------------------------------------------------------------*/ /* This function uses a simple O(N^2) implementation. It probably has a * smaller constant and therefore is useful in the small N case, and is also * useful for testing the relatively complex O(N log N) implementation. */ float kendallSmallN( float *arr1, float *arr2, int len ) { int m1 = 0, m2 = 0, s = 0, nPair , i,j ; float cor ; /* printf("enter kendallSmallN\n") ; */ for(i = 0; i < len; i++) { for(j = i + 1; j < len; j++) { if(arr2[i] > arr2[j]) { if (arr1[i] > arr1[j]) { s++; } else if(arr1[i] < arr1[j]) { s--; } else { m1++; } } else if(arr2[i] < arr2[j]) { if (arr1[i] > arr1[j]) { s--; } else if(arr1[i] < arr1[j]) { s++; } else { m1++; } } else { m2++; if(arr1[i] == arr1[j]) { m1++; } } } } nPair = len * (len - 1) / 2; if( m1 < nPair && m2 < nPair ) cor = s / ( sqrtf((float)(nPair-m1)) * sqrtf((float)(nPair-m2)) ) ; else cor = 0.0f ; return cor ; } #if 0 int main( int argc , char *argv[] ) { float a[100], b[100]; float smallNCor, smallNCov, largeNCor, largeNCov; int i; int N ; #define NBOT 10 #define NTOP 2000 #define NSTEP 10 /* Test the small N version against a few values obtained from the old * version in R. Only exercising the small N version because the large * N version requires the first array to be sorted and the second to be * reordered in lockstep before it's called.*/ { float a1[] = {1,2,3,5,4}; float a2[] = {1,2,3,3,5}; float b1[] = {8,6,7,5,3,0,9}; float b2[] = {3,1,4,1,5,9,2}; float c1[] = {1,1,1,2,3,3,4,4}; float c2[] = {1,2,1,3,3,5,5,5}; printf("kSmall 5 = %g\n", kendallSmallN(a1, a2, 5) ) ; printf("kSmall 7 = %g\n", kendallSmallN(b1, b2, 7) ) ; printf("kSmall 8 = %g\n", kendallSmallN(c1, c2, 8) ) ; } /* Now that we're confident that the simple, small N version works, * extensively test it against the much more complex and bug-prone * O(N log N) version. */ for(i = 0; i < 100; i++) { int j, len; for(j = 0; j < 100; j++) { a[j] = rand() % 30; b[j] = rand() % 30; } len = rand() % 50 + 50; /* The large N version assumes that the first array is sorted. This * will usually be made true in R before passing the arrays to the * C functions. */ insertionSort(a, len); smallNCor = kendallSmallN(a, b, len); largeNCor = kendallNlogN (a, b, len); printf("Cor: kSmall = %g kNlogN = %g\n",smallNCor,largeNCor) ; } printf("-- short tests done --\n") ; /* Speed test. Compare the O(N^2) version, which is very similar to * R's current impl, to my O(N log N) version. */ { float *foo, *bar, *buf; int i; float startTime, stopTime; foo = (float *) malloc(NTOP * sizeof(float)); bar = (float *) malloc(NTOP * sizeof(float)); buf = (float *) malloc(NTOP * sizeof(float)); for( N=NBOT ; N <= NTOP ; N += NSTEP ){ for(i = 0; i < N; i++) { foo[i] = rand(); bar[i] = rand(); } startTime = clock(); smallNCor = kendallSmallN(foo, bar, N); stopTime = clock(); printf("N=%d: slow = %f ms val = %g\n", N,stopTime - startTime,smallNCor); startTime = clock(); /* Only sorting first array. Normally the second one would be * reordered in lockstep. */ #if 1 qsort_floatfloat( N , foo , bar ) ; #else mergeSort(foo, buf, N); #endif largeNCor = kendallNlogN(foo, bar, N); stopTime = clock(); printf("N=%d: fast = %f ms val = %g\n", N,stopTime - startTime,largeNCor); } } return 0; } #endif