#Load required packages
require(Boruta)
require(randomForest)
require(GGally)   # version 1.3.0

#Set workspace
setwd("D:/Area2/HabMap/V2") # Input workspace path

#Read the comma-separated training table (txt file with a header row).
#Column 1 is the response variable; columns 2..n are the predictor features.
sdata <- read.table(
  file = "Iverryggen_training_data.txt",
  sep = ",",
  header = TRUE
)

# Predictors only: every column after the first, kept as a data frame
preds <- sdata[, 2:ncol(sdata), drop = FALSE]

#Boruta feature selection
# The response is the first column of sdata, coerced to a factor; every column
# of preds is offered as a candidate feature.
B1 <- Boruta(
  as.factor(sdata[[1]]) ~ .,
  data = preds,
  pValue = 0.05,
  maxRuns = 500
)
# Names of the features whose final decision is "Confirmed"
implist <- names(B1$finalDecision)[B1$finalDecision == "Confirmed"]

#Print and plot results on screen
# The same margins and plot call are used for the screen and for the exported
# file, so they live in one small helper.
draw_boruta <- function() {
  par(mar = c(6.3, 4, 1, 1))
  plot(B1, las = 2, xlab = "")
}

print(B1)
draw_boruta()

#Export Boruta figure
jpeg(
  "Boruta.jpeg",
  width = 18, height = 10, res = 500, units = "cm",
  pointsize = 8, quality = 100
)
draw_boruta()
dev.off()

#Correlation coefficients
# Pairwise correlations among the Boruta-confirmed predictors (implist)
z <- cor(preds[, implist])

#Variable importance (MeanDecreaseAccuracy)
# Fit a random forest on the Boruta-confirmed predictors only. The response is
# the first column of sdata, coerced to a factor.
rf <- randomForest(as.factor(sdata[[1]]) ~ ., data = preds[implist],
                   ntree = 500, importance = TRUE)
# type = 1 requests the mean-decrease-in-accuracy measure named above;
# scale = FALSE asks for the unscaled values.
rfimp <- as.data.frame(importance(rf, type = 1, scale = FALSE))

# Move the predictor names from the row names into a regular column
rfimp$Var <- rownames(rfimp)
rownames(rfimp) <- NULL

# Sort by decreasing importance and put the name column first. The importance
# column is extracted as a plain vector with [[1]]: passing a one-column data
# frame to order() relies on xtfrm() of a data frame, which recent R versions
# warn about. TRUE is spelled out because T can be reassigned.
rflist <- list(rfimp[order(rfimp[[1]], decreasing = TRUE), c(2, 1)])

rflist

#Load the external HighstatLibV6 helper script
highstat_script <- "U:/Tools and Scripts/Variable selection/HighstatLibV6.R"
source(highstat_script)

#Variable inflation factors of important variables
# NOTE(review): corvif() is not defined in this file; presumably it is provided
# by the script sourced above - confirm.
corvif(preds[, implist, drop = FALSE])

#Remove correlated variables (|r| >= 0.7), keeping the more important one
# rflist[[1]][[1]] holds the predictor names sorted by decreasing importance,
# so the rows and columns of cr follow that same ranking.
cr <- cor(preds[rflist[[1]][[1]]])
cor_threshold <- 0.7
ranked <- colnames(cr)

# pl1: the top-ranked predictor plus every predictor whose absolute correlation
# with it is below the threshold. The selection below does not use it; it is
# still assigned in case code outside this section refers to it.
pl1 <- ranked[c(TRUE, abs(cr[1, -1]) < cor_threshold)]

# Greedy selection in importance order: a predictor is kept only if its
# absolute correlation with every predictor kept so far is below the threshold.
# Changes relative to the previous loop:
#  * only predictors that are still kept can eliminate others (the old test
#    against pl1 let an already-dropped predictor remove further ones);
#  * columns are addressed by position, so the comparison between the last two
#    predictors is no longer lost when a one-element matrix subset drops its
#    names;
#  * a correlation of exactly the threshold is treated as "correlated"
#    throughout (previously it was "< 0.7" in the first row, "> 0.7" later);
#  * NA correlations are ignored rather than producing NA names.
keep <- logical(length(ranked))
for (j in seq_along(ranked)) {
  kept_so_far <- which(keep)
  keep[j] <- !any(abs(cr[j, kept_so_far]) >= cor_threshold, na.rm = TRUE)
}

# pl: names of the retained predictors, still in decreasing importance order
pl <- ranked[keep]


#Table of the retained predictors: column 1 holds the names in pl, column 2
#receives whatever corvif() returns for those predictor columns.
# NOTE(review): corvif() is not defined in this file (a HighstatLibV6.R script
# is sourced earlier); presumably it returns one variance inflation factor per
# predictor - confirm.
crval <- as.data.frame(pl)
crval[2] <- corvif(preds[,pl])

#predsel: the retained predictor names as a character vector.
#clms: the name of the first column of sdata followed by those predictor names.
predsel <-  as.character(crval[[1]])
clms <- c(names(sdata[1]),predsel)
