#' Cluster center
#' 
#' Computes the mean of a given cluster. Internal
#' function for isspc and solsel.
#' 
#' @param y a numeric matrix of data.
#' @param x an index for a specific cluster.
#' @param class a numeric vector of cluster
#' assignments.
#' @param trimprop a proportion for trimming.
#' If trimprop = 0, no trimming is done.
#' @param center "mean" or "median". Defaults to "mean".
#' If "median" no trimming is done.
#' @return A numeric vector representing the mean
#' of the cluster.
#' @export
meanclust <- function(y, x, class, trimprop, center) {
  p <- ncol(y)
	yF <- y[which(class == x), ]
	if(length(yF) > p) {
		if(center == 'median') {
			mu <- apply(yF, 2, median)
			} else { 
			  if(trimprop == 0) {
          mu <- colMeans(yF) 
		      } else {  
			      mu <- apply(yF, 2, function(x) mean(x, trim=trimprop)) 
		        }
			}
	} else { mu <- yF }
	return(mu)
}


#' Cluster variance
#' 
#' Computes the variance of a cluster, assuming
#' constant diagonal covariance.
#' Internal function for isspc.
#'  
#' @param y a numeric matrix of data.
#' @param x an index for a specific cluster.
#' @param class a numeric vector of cluster
#' assignments.
#' @param trimprop a proportion for trimming.
#' If trimprop = 0, no trimming is done.
#' @param center "mean" or "median". Defaults to "mean".
#' If "median" no trimming is done.
#' @return A numeric vector representing the diagonal of the 
#' covariance matrix for a particular cluster.
#' @export
varclust <- function(y, x, class, trimprop, center) {
  p <- ncol(y)
	yF <- y[which(class == x), ]
	if(length(yF) > p) {
		if(center == "median") {
			sumsq <- apply(yF, 2, function(x) sum((x - median(x))^2))	
			} else {
        if(trimprop == 0) { 
          sumsq <- apply(yF, 2, function(x) sum((x - mean(x))^2))  
          } else {
          cutnum <- ceiling(trimprop*nrow(yF))
          cutind <- apply(yF, 2, function(x) order(x)[ c(1:cutnum, (length(x)-cutnum+1):length(x))])
			    yF <- sapply(1:p, function(x,y) y[-cutind[,x], x], y=yF)
			    sumsq <- apply(yF, 2, function(x) sum((x - mean(x))^2))
        }
			}
      S <- sumsq / nrow(yF)
	} else { S <- apply(y , 2, function(x) sum((x - mean(x) )^2)) / nrow(y) }
	
  return(S)
  }


#' Likelihood 
#' 
#' Calculates Gaussian likelihood for a data point based
#' on a constant diagonal covariance. 
#' Internal function for isspc.
#' 
#' @param yi a numeric vector representing a data point.
#' @param mu a numeric vector representing a cluster 
#' center.
#' @param S a numeric vector representing a
#' variance of a cluster. 
#' @param propk a proportion representing a cluster.
#' @return A likelihood value for a given data point
#' and a given cluster. 
#' @export
likhood <- function(yi, mu, S, propk) {
  p <- length(yi)
	normc <- ((2*pi)^(-p/2))*prod(S)^(-1/2)
	SS <- crossprod((yi - mu)*1/S, (yi - mu))
	normd <- normc*exp(-1/2*SS)
	li <- propk*normd
	return(li)	
  }


#' Cluster selection
#' 
#' Selects clusters based on chi-square tests.
#' Internal function for isspc.
#' 
#' @param ytrain a numeric matrix of training data, a
#' subsample of the full data y.
#' @param class a numeric vector of cluster assignments for 
#' the given training data. The length of the vector should be
#' nrow(ytrain).
#' @param use a numeric vector of indices for the training data.
#' Indices represent the position of each training data point
#' in the full data y.
#' @param classall a numeric vector of the cluster assignements
#' based on the full data y. The length of the vector should be
#' nrow(y).
#' @param Beta a cut off for false discovery rate (FDR) that is 
#' greater than 0. Recommended to be in (0.01, 0.1).
#' @param eta an integer in (1, p), where p is the number of 
#' dimensions of the data. Represents the number of 
#' dimensions with the false discovery rate smaller than Beta. 
#' @param Sy a numeric vector representing the diagonal of the covariance 
#' matrix of the full dataset y.
#' @param k0 an integer representing cluster size cut off. 
#' @return Returns a numeric vector of cluster assignments with the
#' rejected clusters assigned to singleton clusters.
#' @export
clustsel <- function(ytrain, class, use, classall, Beta, eta, Sy, k0) {
  p <- ncol(ytrain)
  NKclass <- sapply(unique(class), function(x) length(which(class == x)))
  indclust <- which(NKclass > k0)
  S <- sapply(unique(class)[indclust], function(x) varclust(y=ytrain, x=x, class=class, 
              trimprop=0, center="mean"))

  dof2 <- NKclass[indclust] - 1
  dof1 <- length(classall) - 1
  ms2 <- S
  ms1 <- Sy
  keep <- c()
  cutoff <- c()
  for(asig in 1:length(indclust)) {
    Fkm <- (dof2[asig]*ms2[ , asig]) / ms1
    pval <- pchisq(Fkm, dof2[asig])
    pvalsort <- sort(pval)
    FDR <- (pvalsort*p) / (1:p)
    FDRcut <- which(FDR <= Beta)
    if(length( FDRcut ) > eta) { 
      keep <- c(keep, asig) 
      } else { cutoff <- c(cutoff, asig) }
  }
  
  if(length(cutoff) != 0) {
    indcuti <- which(class %in% indclust[cutoff])
    f1 <- max(classall) + 1
    classall[ use ][indcuti] <- f1:(length(indcuti) + f1 - 1)
    indswitch <- sapply(unique(classall), function(x) which(classall == x), simplify=FALSE)
    f2 <- 1
    for(z in 1:length(indswitch)) {
      classall[indswitch[[z]]] <- f2
      f2 <- f2 + 1
    }
    }
  
  return(list(classall=classall, keep=keep))
  } # end cluster select function


#' Sequential assignment
#' 
#' Assigns test data to existing clusters or noise 
#' sequentially based on likelihood ratio tests. 
#' Internal function for isspc.
#'  
#' @param ytrain a numeric matrix of training data.
#' @param ytest a numeric matrix of test data.
#' @param use a numeric vector of data indices for the training data. 
#' The length of the vector should be nrow(ytrain).
#' @param hold a numeric vector of data indices for the test data.
#' The length of the vector should be nrow(ytest).
#' @param classall a numeric vector of cluster assignments for the
#' full dataset. The length of the vector should be nrow(y).
#' @param k0 an integer representing cluster size cut off. 
#' @param Sy a numeric vector representing the diagonal of the covariance 
#' matrix of the full dataset y.
#' @param mu a numeric vector representing the overall mean of the full
#' dataset y.
#' @param trimprop a proportion of a cluster size to be trimmed for 
#' computing a trimmed mean and variance and subsequently likelihood. 
#' In general, the higher proportions result in a larger number of 
#' tight clusters. If trimprop = 0, no trimming is performed.
#' Recommended not to exceed 0.25.
#' @inheritParams meanclust
#' @return Returns a numeric vector of cluster assignments with the
#' test data assigned to the existing clusters or to 
#' singleton clusters that represent noise.
#' @export 
sequential <- function(ytrain, ytest, use, hold, classall, trimprop, k0, Sy, my, center) {
  ntrain <- nrow(ytrain)
  ntest <- nrow(ytest)
  class <- classall[use]
  NKclass <- sapply(unique(class), function(x) length( which( class == x ) ) )
  clusters <- which(NKclass > k0)
  cind <- which(class %in% unique(class)[clusters] )
  class[-cind] <- 0
  class_c <- class[cind]

  mu <- t(sapply(unique(class_c), function(x) meanclust(y=ytrain, x=x, class=class, 
                                                        trimprop=trimprop, center=center)))
  S <- sapply(unique(class_c), function(x) varclust(y=ytrain, x=x, class=class, trimprop=trimprop,
                                                    center=center), simplify=FALSE)
  mu <- rbind(mu, my)		
  S <- append(S, list(Sy))
  ind <- sapply(unique(class_c), function(x) length(which(class_c == x)))

  label <- c(unique(class_c), 0)
  linew <- c()
  ynew <- ytrain
  nnew <- sum(ind)
  classadd <- class
  propk <- ind / sum(ind)
  for(h in 1:length(hold)) {
    liknew <- c()
    ynew <- rbind(ynew, ytest[ h, ])
    prop <- c(ind / nnew, 1)
    for(ell in 1:length(label)) {
      muk <- mu[ell, ]
      Sk <- S[[ell]]
      lik <- likhood(yi=ytest[h, ], mu=muk, S=Sk, propk=prop[ell])
      liknew <- c(liknew, lik)
    }
  
    liknew_null <- liknew[length(liknew)]
    liknew_clust <- sum(liknew[-length(liknew)])
    if(liknew_clust/liknew_null < 1) {
      classadd <- c(classadd, 0)
      } else {
      cmax <- which.max(liknew[-length(liknew)])
      classadd <- c(classadd, label[cmax])
      uclass <- unique(classadd)
      uclass <- uclass[-which(uclass == 0)]
      mu[cmax, ] <- meanclust(y=ynew, x=label[cmax], class=classadd, trimprop=trimprop, center=center)
      S[[cmax]] <- varclust(y=ynew, x=label[cmax], class=classadd, trimprop=trimprop, center=center)
      ind[cmax] <- ind[cmax] + 1 
      nnew <- nnew + 1
    }			
  } # end computing likelihood ratios for all test data

  #
  f1 <- max(classall) + 1
  classall[use] <- f1 + classadd[1:length(use)]
  classall[hold] <- f1 + classadd[(length(use)+1):(length(use)+length(hold))]
  indk <- which(classall %in% f1)
  lenk <- length(indk) - 1 
  maxk <- max(classall) + 1
  classall[indk] <- (maxk):(maxk + lenk)
  indswitch <- sapply(unique(classall), function(x) which(classall == x), simplify=FALSE)
  f2 <- 1
  for(z in 1:length(indswitch)) {
    classall[indswitch[[z]]] <- f2
    f2 <- f2 + 1
  }

  return(classall)
  } # end sequential function



#' Iterative subsampling in solution path clustering
#' 
#' Performes clustering and estimates the number of 
#' clusters automatically based on recursions of
#' clustering of a subsample of the full dataset 
#' and sequentially assigning the remaining data
#' to the identified clusters. Recommended for large
#' datasets.
#' 
#' @param y a numeric matrix of data. 
#' @param omega1 a value in (0, 1). An approximate proportion 
#' of nearest neighbors expected to merge in the initial solution
#' for the first recursion of the iterative subsampling
#' solution path clustering.
#' @param a a scalar in nu=a*sqrt(n), where n is the size of the data
#' or nrow(y) and nu is subsample size. Recommended to be in [1, 10].
#' @inheritParams clustsel
#' @inheritParams sequential
#' @inheritParams meanclust
#' @param omega2 same as omega1, but used in all recursions greater
#' than 1. Defaults to 0.1. 	
#' @inheritParams spc
#' @param rand TRUE or an integer for a seed number. Should the seed be 
#' generated randomly for subsampling. If a seed number is provided, 
#' a vector of seeds is randomly generated in advance to be used for the iterations.  
#' @return \item{classall}{ a numeric vector of cluster assignments.}
#' \item{centers}{ a numeric matrix of cluster centers.}
#' \item{Kest}{ an estimated number of clusters.}
#' \item{clust}{ the number of clustered data identified.}
#' \item{noise}{ the number of noisy data points identified.}
#' \item{iter}{ a number of recursions or iterations of the iterative
#' subsampling procedure.}
#' \item{randseed}{ a vector of integers for random seeds used for
#' generating subsamples. The length is equal to the number of
#' iterations.}
#' 
#' @references Marchetti, Y., Zhou, Q. Iterative Subsampling in 
#' Solution Path Clustering of Noisy Big Data. arXiv preprint arXiv:1412.1559.
#' 
#' @examples
#' 
#' # simulate a clustered dataset with 50% noise
#' K <- 10
#' p <- 20
#' ncl <- 5000
#' noise <- 5000
#' sim <- simclust(K=K, p=p, ncl=ncl, noise=noise, overnk=0, hcube=c(-5, 5), random=TRUE)
#' 
#' # cluster with iterative subsampling solution path clustering
#' fit <- isspc(y=sim$y, omega1=0.5, a=2, Beta=0.01, eta=10, trimprop=0, rand=6834)
#' ARI(assignV=fit$classall, k0=3, assignT1=sim$assignT1)
#' 
#' @export
isspc <- function(y, omega1, a, Beta, eta, trimprop, center="mean", omega2=0.1, k0=3, phi=0.5, 
                  tau="default", alpha=0.9, H="default", xi=1e-4, out=FALSE, rand=TRUE) {
  if(sum(is.na(y)) > 0) {
    stop("no NaNs allowed in the data")
    }
  n <- nrow(y)
  p <- ncol(y)
  my <- colMeans(y)
  Sy <- apply(y , 2, function(x) sum((x - mean(x) )^2)) / n
  randseed <- c()
  # initialize assignment
  Kest <- c()
  classall <- 1:n
  ntrain <- a*ceiling(sqrt(n))
  cond_break <- FALSE
  iter <- 1
  
  while(!cond_break) {
    NK <- sapply(unique(classall), function(x) length(which(classall == x)))
    nind <- unique(classall)[which(NK <= k0)]
    newind <- which(classall %in% nind)
    cond_break <- length(newind) < 10
    if(cond_break) { break }
    if(ntrain >= length(newind)) { ntrain <- length(newind)  - 2 }
    if(rand == TRUE) {
      randseed <- c(randseed, sample(1:100000, 10000))  
      } else {
          if(length(rand) == 1) {
           set.seed(rand)
           randseed <- c(randseed, sample(1:100000, 10000)) 
           } else {
             stop("rand should be an integer")
             }
      }
      
    set.seed(randseed[iter])
    use <- sort(sample(newind, ntrain))
    hold <- newind[-which(newind %in% use)]
    hold <- sample(hold, size=length(hold)) 
    ytrain <- y[use, ]
    ytest <- y[hold, ]
  
    # clustering
    if(iter > 1) { omega <- omega2 } else { omega <- omega1 }
    fit <- spc(y=ytrain, omega=omega, phi=phi, tau=tau, alpha=alpha, H=H, xi=xi, out=out)
    assignV <- fit$assignV
    NK <- sapply(assignV, function(xx) sapply(unique(xx), function(x) length(which(xx == x))))
    clusters <- sapply(NK, function(x) length(which(x > k0)))
    noise <- sapply(NK, function(x) length(which(x <= k0)))
  
    newm <- which.max(clusters)
    class <- fit$assignV[[newm]]
    f1 <- max(classall, na.rm=TRUE)
    classall[use] <- class + f1
    indswitch <- sapply(unique(classall), function(x) which(classall == x), simplify=FALSE )
    f2 <- 1
    for(z in 1:length(indswitch)) {
      classall[indswitch[[z]]] <- f2
      f2 <- f2 + 1
      }
    NK <- sapply(unique(classall), function(x) length(which(classall == x)))
    
    # select clusters and return the assignment vector
    temp <- clustsel(ytrain=ytrain, class=class, use=use, classall=classall, Beta=Beta, eta=eta, Sy=Sy, k0=k0)
    NK <- sapply(unique(temp$classall), function(x) length(which(temp$classall == x)))
    classall <- temp$classall
    Kest <- c(Kest, length(temp$keep))
    print(paste("iteration ", iter, ", ", Kest[iter], " new clusters", sep=""))
    cond_break <- length(temp$keep) == 0
    if(cond_break) break

    # sequential assignment
    classall <- sequential(ytrain, ytest, use, hold, classall, k0=k0, Sy=Sy, my=my, trimprop=trimprop,
                           center=center)
    iter <- iter + 1
    } # end iterative subsampling
  
  NK <- sapply(unique(classall), function(x) length(which(classall == x)))
  indK <- which(NK > k0)
  clustind <- unique(classall)[indK]
  centers <- sapply(clustind, function(x) colMeans(matrix(y[which(classall == x), ], ncol=p)))
  clustered <- sum(NK[indK])
  noisy <- sum(NK[which(NK <= k0)])
  
  return(list(classall=classall, centers=centers, Kest=sum(Kest), clust=clustered, noise=noisy, 
              iter=iter, randseed=randseed[1:iter]))
  }  



