#' Cluster simulation
#' 
#' Simulate spherical, equal variance clusters with or without 
#' noise and overlap. 
#' 
#' @param K a desired number of clusters.
#' @param p a desired number of dimensions.
#' @param ncl a desired number of clustered data points.
#' @param noise a desired number of noisy data points.
#' @param overnk how many clusters should overlap. Overlapping 
#' clusters are generated such that 15-20\% of 
#' the data points in a pair of clusters are located within the 
#' radiuses of both clusters.
#' @param hcube an interval from where the clusters centers
#' should be randomly generated.
#' @param random TRUE or an integer for a seed number. If a seed number is 
#' specified, a vector of random seeds is generated in advance for all 
#' simulations, including cluster centers and the data points themselves.
#' @return \item{y}{ a numeric matrix of data.}
#' \item{assignT1}{ a numeric vector of cluster assignments. Noise represented
#' as singleton clusters and added at the end of the dataset.}
#' \item{assignT2}{ a numeric vector of assignements to either clusters or noise. 
#' 1 -- for clustered data points, 2 -- for noise.}
#' \item{osd}{ variance used to generate the clusters, equal for all clusters.}
#' 
#' @references Marchetti, Y., Zhou, Q. Solution path clustering with adaptive concave penalty. 
#' Electron. J. Statist. 8 (2014), no. 1, 1569--1603.
#' 
#' @examples
#' 
#' # simulate clusters in two dimensions, with 10% noise and no overlap,
#' # set the random seed
#' K <- 3
#' p <- 2
#' ncl <- 300
#' noise <- 30
#' sim <- simclust(K=K, p=p, ncl=ncl, noise=noise, overnk=0, hcube=c(-5, 5), random=4787)
#' 
#' # scatter plot of the resulting simulated clusters,
#' plot(sim$y, col=c(sim$assignT1[1:ncl], rep(8, noise)), pch=c(rep(1, ncl), rep(4, noise)))
#' 
#' # same clusters but with overlap
#' sim <- simclust(K=K, p=p, ncl=ncl, noise=noise, overnk=1, hcube=c(-5, 5), random=4787)
#' plot(sim$y, col=c(sim$assignT1[1:ncl], rep(8, noise)), pch=c(rep(1, ncl), rep(4, noise)))   
#'    
#' @export       
simclust <- function(K, p, ncl, noise, overnk, hcube, random=TRUE) {	
  library(fields)
  library(MixSim)
  
  if(noise == 0) {
    scatter <- FALSE
    } else { scatter <- TRUE }
  if(overnk == 0) {
    wellsep <- TRUE
    } else { wellsep <- FALSE }
  # cluster overlap function
	overFUNC <- function(assignT, y, K) {
		over <- matrix( NA, K, K )
		olen <- matrix( NA, K, K )
		over2 <- matrix( NA, K, K )
		olen2 <- matrix( NA, K, K )
		for( k in 1:K ) {
		  yCl <- y[ which( assignT == k ), ]
			meanCl <- t( as.matrix( colMeans( yCl ) ) )
			distCl <- max( rdist( meanCl, yCl ) )
			distCl2 <- sort(rdist( meanCl, yCl ),decreasing=TRUE)[2]
			for( k1 in 1:K ) {
				yClcomp <- y[ which( assignT == k1 ), ]
				distClcomp <- rdist( meanCl, yClcomp )
				overind <- which( distClcomp <= distCl )
				overind2 <- which( distClcomp <= distCl2 )
				over[ k, k1 ] <- length( overind )/nrow( yClcomp )
				olen[ k, k1 ] <- length( overind )
				over2[ k, k1 ] <- length( overind2 )/nrow( yClcomp )
				olen2[ k, k1 ] <- length( overind2 )
				}
		  }
		return( list( olen=olen, over=over ) )
	  }


	overmatFUNC <- function( assignT, y, K ) {
		overmat <- overFUNC( assignT=assignT, y=y, K=K )$olen
		overmat0 <- overmat
			diag( overmat0 ) <- 0
			ccl <- which( overmat0 > 0, arr.ind=T )
			if( length( ccl ) > 0 ) { xxx1 <- FALSE} else { xxx1 <- TRUE }
			if( sum( duplicated( ccl[, 1] ) ) > 0 ) { 
				dupind <- which( duplicated( ccl[,1 ] ) )
				dupind1 <- which( ccl[ , 1] == ccl[ dupind, 2 ] )
				if( length( dupind1 ) > 0 ) { dupind <- which( duplicated( ccl[,1 ], fromLast=TRUE ) ) }
				ccl[ dupind, ] <- rev( ccl[ dupind, ] )
				}
			while( xxx1 == FALSE ) {
				xxx <- sapply( 1:nrow(ccl), function(x) which( ccl[x,1] == ccl[,2] & which( ccl[x,2] == ccl[,1]) ) )
				temp1 <- sapply( xxx, function(x)  ifelse( length( x ) == 0, 0, x ) )
				xxx1 <- is.na( which( temp1 > 0 )[1] )
				if( xxx1 ) break
				ccl <- ccl[ -which( temp1 > 0 )[1], ]
				if( length( ccl ) <= 2 ) { 
					ccl <- t( as.matrix( ccl ) )
					xxx1 <- TRUE }
				}

			ccl <- t( ccl )	
			overInd1 <- diag( as.matrix( overmat[ ccl[2, ], ccl[1, ] ] )  )
			overInd2 <- diag( as.matrix( overmat[ ccl[1, ], ccl[2, ] ] )  )
			sumO <- colSums( rbind( overInd1, overInd2 ) )
			sumK <- colSums( rbind( diag(overmat)[ccl[1,]], diag(overmat)[ccl[2,]] ) )
			over0 <- sumO/sumK
		
			return( list( overmat=overmat, over0=over0, ccl=ccl ) )
	}

  # set random seeds 
  if(random == TRUE) {
    stseed <- sample(1:1000000, 1)
  } else {
    set.seed(random)
    stseed <- sample(1:1000000, 1)
    }

	set.seed(stseed)
	ranseeds <- sample( 100000:999999, 3)

	set.seed(ranseeds[1])
	mm <- matrix(runif( 100*p, min=hcube[1], max=hcube[2] ), ncol=p)
	mK <- mm[sample(1:100, size=K), ]

	distm <- rdist(mK)
	diag(distm) <- NA
	nn <- apply(distm, 2, function(x) min(x, na.rm=TRUE))

	### equal variance -- half the distance and >= 5 standard deviations
	osd <- rep(min(nn)/2/5, K)
	Pi <- rep( K/ncl, K )
	S <- array( 0, dim=c( p, p, K ) )
	for(k in 1:K) { diag( S[ , , k]) <- osd[k]  }
	
	if(wellsep) {	
		set.seed(ranseeds[2])
		A <- simdataset(n = ncl, Pi = Pi, Mu = mK, S = S)
		y <- A$X
		assignT <- A$id
		assignT1 <- assignT
		assignT2 <- rep(1, ncl)
		if( scatter ) { 
				set.seed( ranseeds[2] )
				A <- simdataset(n = ncl, Pi = Pi, Mu = mK, S = S, n.out=noise, int=hcube)
				y <- A$X
				assignT <- A$id
				assignT[ (ncl+1):(ncl+noise) ] <- K+1
				assignT1 <- assignT
				assignT1[ (ncl+1):(ncl+noise) ] <- (K+1):(noise + K)
				assignT2 <- c(rep(1, ncl), rep(2, noise ))
				}
	  } else {
		### overlap and equal variance
		nnO <- apply( distm, 2, function(x) which( x == min( x, na.rm=TRUE ) ) )
		ccl <- rbind(1:K, nnO)

		### move some clusters to overlap
		set.seed(ranseeds[3])
		moveOrig <- sample(1:K, overnk)
		move <- moveOrig
		cff <- 0.05
		stopM <- TRUE

		while(stopM) {
			for(j in 1:length(move)) {
				mKmove <- mK[ccl[ , move[j]], ]
				movestep <- apply( mKmove, 2, diff )*cff
				mK[ ccl[ , move[j] ][ 1 ], ] <- mKmove[ 1, ] + movestep
				}

		set.seed(ranseeds[2])
		A <- simdataset(n = ncl, Pi = Pi, Mu = mK, S = S)
		y <- A$X
		assignT <- A$id
		assignT1 <- assignT
		assignT2 <- c(rep(1, ncl))
		overmat <- overFUNC( assignT=assignT, y=y, K=K )$olen
		overInd1 <- diag( as.matrix( overmat[ ccl[2,move], ccl[1,move] ] )  )
		overInd2 <- diag( as.matrix( overmat[ ccl[1,move], ccl[2,move] ] )  )
		sumO <- colSums( rbind( overInd1, overInd2 ) )
		sumK <- colSums( rbind( diag(overmat)[ccl[1,move]], diag(overmat)[ccl[2,move]] ) )
		over0 <- sumO/sumK
		overIND <- which( over0 > 0.10 )
		if( length( overIND ) > 0 )  { move <- move[ - overIND ] 
			} else { move <- move }
		if( length( move ) > 0 ) { 
			stopM <- TRUE
			} else { stopM <- FALSE }
		}	

		### final overlap after merge
		overmat <- overFUNC( assignT=assignT, y=y, K=K )$olen
		overInd1 <- diag( as.matrix( overmat[ ccl[2,moveOrig], ccl[1,moveOrig] ] )  )
		overInd2 <- diag( as.matrix( overmat[ ccl[1,moveOrig], ccl[2,moveOrig] ] )  )
		sumO <- colSums( rbind( overInd1, overInd2 ) )
		sumK <- colSums( rbind( diag(overmat)[ccl[1,moveOrig]], diag(overmat)[ccl[2,moveOrig]] ) )
		over0 <- sumO/sumK

		### overlapping with noise
		if( scatter ) {
			set.seed( ranseeds[2] )
			A <- simdataset( n = ncl, Pi = Pi, Mu = mK, S = S, n.out=noise, int=hcube )
			y <- A$X
			assignT <- A$id
			assignT[ (ncl+1):(ncl+noise) ] <- K+1
			assignT1 <- assignT
			assignT1[ (ncl+1):(ncl+noise) ] <- (K+1):( noise+K )
			assignT2 <- c(rep(1, ncl), rep(2, noise))
			}
	}

  # function returns the clustered data y, its assignment to corresponding clusters K
  # and noise K+1 assignT1,
  # assignment to two clusters (1 = clusters, 2 = noise) only assignT2,
  # and equal cluster variance osd
	return(list(y=y, assignT1=assignT1, assignT2=assignT2, osd=osd[1]))

} # end function





