# The following function computes the new correlation coefficient, and returns the P-value for testing independence unless otherwise specified. Removes NAs and converts factor variables to integers automatically, unless otherwise specified. In general, it is safe to just supply x and y, and leave the other parameters to their default values.
xi = function(x, y, pvalue = T, ties = T, method = "asymptotic", nperm = 1000, factor = T) {
# x and y are the data vectors
# The P-value for the test of independence is returned if pvalue = T. Otherwise, only the coefficient is returned.
# If ties = T, the algorithm assumes that the data has ties and employs the more elaborated theory for calculating the P-value. Otherwise, it uses the simpler theory. There is no harm in putting ties = T even if there are no ties.
# method = "asymptotic" returns P-values computed by the asymptotic theory. If method = "permutation", a permutation test with nperm permutations is employed to estimate the P-value. Usually, there is no need for the permutation test. The asymptotic theory is good enough.
# nperm is the number of permutations for the permutation test, if needed.
# factor = T results in the algorithm checking whether x and y are factor variables and converting them to integers if they are. If it is known that the variables are numeric, a little bit of time can be saved by setting factor = F.
# NAs are removed here:
ok = complete.cases(x,y)
x = x[ok]
y = y[ok]
# Factor variables are converted to integers here:
if (factor == T) {
if (!is.numeric(x)) x = as.numeric(factor(x))
if (!is.numeric(y)) y = as.numeric(factor(y))
}
n = length(x) # n is the sample size.
PI = rank(x, ties.method = "random") # PI is the rank vector for x, with ties broken at random
f = rank(y, ties.method = "max")/n # f[i] is number of j s.t. y[j] <= y[i], divided by n.
g = rank(-y, ties.method = "max")/n # g[i] is number of j s.t. y[j] >= y[i], divided by n.
ord = order(PI) # order of the x's, ties broken at random.
f = f[ord] # Rearrange f according to ord.
# xi is calculated in the next three lines:
A1 = mean(abs(f[1:(n-1)] - f[2:n]))*(n-1)/(2*n)
C = mean(g*(1-g))
xi = 1 - A1/C
# If P-value needs to be computed:
if (pvalue == T) {
# If there are no ties, return xi and theoretical P-value:
if (ties == F) return(list(xi = xi, pval = 1 - pnorm(sqrt(n)*xi/sqrt(2/5))))
# If there are ties, and the theoretical method is to be used for calculation P-values:
if (method == "asymptotic") {
# The following steps calculate the theoretical variance in the presence of ties:
q = sort(f)
ind = c(1:n)
ind2 = 2*n - 2*ind + 1
a = mean(ind2*q*q)/n
c = mean(ind2*q)/n
cq = cumsum(q)
m = (cq + (n - ind)*q)/n
b = mean(m^2)
v = (a - 2*b + c^2)/(C^2)
# Return xi and P-value:
return(list(xi = xi, pval = 1 - pnorm(sqrt(n)*xi/sqrt(v))))
}
# If permutation test is to be used for calculating P-value:
if (method == "permutation") {
r = rep(0, nperm)
for (i in 1:nperm) {
x1 = runif(n, 0, 1)
r[i] = xi(x1,y)$xi
}
# Return xi and P-value based on permutation test:
return(list(xi = xi, pval = mean(r > xi)))
}
cat("Invalid method. Use either asymptotic or permutation.")
}
# If only xi is desired, return xi:
else return(xi)
}