## Copyright (c) 2014 Takafumi Kanamori [kanamori@is.nagoya-u.ac.jp]
## All rights reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions are met:
## - Redistributions of source code must retain the above copyright notice,
##   this list of conditions and the following disclaimer.
## - Redistributions in binary form must reproduce the above copyright notice,
##   this list of conditions and the following disclaimer in the documentation
##   and/or other materials provided with the distribution.
##
## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
## AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
## IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
## ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
## LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
## CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
## SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
## INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
## CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
## ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
## POSSIBILITY OF SUCH DAMAGE.

## Kernel-based density-ratio fitting: KuLSIF, proposed in
##   T. Kanamori, T. Suzuki, M. Sugiyama,
##   "Statistical analysis of kernel-based least-squares density-ratio estimation",
##   Machine Learning, vol. 86, issue 3, pp. 335-367, March 2012.

## required library: "kernlab"
## "snow" is available for parallel computing (optional)
library(kernlab)

## KuLSIF(): estimation of the density ratio with a Gaussian kernel
## sigma:  a positive real value or a vector; for a vector, LOOCV is applied automatically.
## lambda: a positive real value or a vector; for a vector, LOOCV is applied automatically.
## data:   de (denominator sample) and nu (numerator sample); de ~ p, nu ~ q
##         (density ratio: r = q/p)
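## Summary of the estimator implemented below (derived from the code, using
## its own variable names): the fitted density ratio has the kernel expansion
##
##     r(x) = sum_i alpha_i k(x, de_i) + (1/(m*lambda)) * sum_j k(x, nu_j),
##
## where k is the Gaussian kernel rbfdot(sigma), de_1,...,de_n is the
## denominator sample, nu_1,...,nu_m is the numerator sample, and alpha
## solves the regularized linear system
##
##     (Kxx + n*lambda*I) %*% alpha = -Kxy %*% rep(1/(m*lambda), m).
##
## KuLSIF.single() computes alpha either directly with solve() or by
## minimizing the equivalent quadratic objective with optim();
## predict.KuLSIF() evaluates r(x) at new points and truncates it at zero.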
KuLSIF <- function(de, nu,
                   ## sigma=sigest(rbind(de,nu))[2],
                   sigma=sigest(nu)[2],                  ## band-width of the Gaussian kernel
                   lambda=1/min(nrow(de),nrow(nu))^0.9,
                   loocv=TRUE,
                   direct=TRUE,
                   cluster=NULL,
                   method="BFGS",
                   control=list(maxit=4000,reltol=10^(-6)),
                   frac=1) {
  if(is.vector(de)) de <- matrix(de,length(de),1)
  if(is.vector(nu)) nu <- matrix(nu,length(nu),1)
  if(ncol(de)!=ncol(nu)) stop("Dimensions of the two data sets do not match!")
  if(frac<1){                       ## optionally subsample the denominator data
    nde <- ceiling(nrow(de)*frac)
    de <- de[sort(sample(1:nrow(de),nde)),,drop=FALSE]
  }
  hparameters <- expand.grid(sigma,lambda)
  dimnames(hparameters)[[2]] <- c("sigma","lambda")
  ## select solver:
  ##   direct=TRUE  -> solve(); direct=FALSE -> optim()
  ##   (optim() is forced for large denominator samples)
  if(nrow(de)>=8000) direct <- FALSE
  dat <- list(de=de,nu=nu)
  if((!loocv) && (length(lambda)==1) && (length(sigma)==1)){
    res <- KuLSIF.single(dat,sigma=sigma,lambda=lambda,direct=direct,method=method,control=control)
    res$lcv <- NULL; res$lcv.min <- NULL
  }else{
    lcv <- LOOCV_KuLSIF(dat,sigma,lambda,cluster=cluster)
    res <- KuLSIF.single(dat,sigma=lcv$opt_sigma,lambda=lcv$opt_lambda,direct=direct,method=method,control=control)
    res$lcv <- lcv$cv; res$lcv.min <- min(lcv$cv)
  }
  res$sigma.list <- sigma; res$lambda.list <- lambda
  res$hyper_para <- hparameters
  class(res) <- 'KuLSIF'
  res
}

## prediction
## res: output of KuLSIF()
## newdat = matrix(, n, dim): data matrix to be evaluated
## output = list(r=, nr=): r = density ratio, nr = normalized density ratio
predict.KuLSIF <- function(res,newdat){
  if(is.vector(newdat)) newdat <- matrix(newdat,length(newdat),1)
  if(ncol(res$dat$de)!=ncol(newdat)) stop("Dimension does not match!")
  sigma <- res$sigma; lambda <- res$lambda
  west_f <- kernelMult(rbfdot(sigma=sigma),newdat,res$dat$de,res$par)
  west_s <- kernelMult(rbfdot(sigma=sigma),newdat,res$dat$nu,
                       rep(1/(nrow(res$dat$nu)*lambda),nrow(res$dat$nu)))
  west <- west_f + west_s
  DRest <- c(pmax(west,0))          ## negative estimates are truncated at zero
  list(r=DRest,nr=DRest/mean(DRest))
}

####################
## misc functions ##
####################

## single fit for a fixed (sigma, lambda) pair
KuLSIF.single <- function(dat,
                          sigma=sigest(rbind(dat$de,dat$nu))[2],
                          lambda=1/min(nrow(dat$de),nrow(dat$nu))^0.9,
                          direct=TRUE,
                          method="BFGS",
                          control=list(maxit=4000,reltol=10^(-6))){
  n <- nrow(dat$de); m <- nrow(dat$nu)
  Kxx <- kernelMatrix(rbfdot(sigma=sigma),dat$de)
  Kxy <- kernelMatrix(rbfdot(sigma=sigma),dat$de,dat$nu)
  if(direct){
    ## closed-form solution of the regularized linear system
    par <- c(solve(Kxx+n*lambda*diag(n),-rowMeans(Kxy)/lambda))
  }else{
    ## numerical minimization of the equivalent quadratic objective
    fn <- function(alpha) sum((Kxx%*%alpha)*alpha)/2 + n*lambda*sum(alpha^2)/2 +
      sum(alpha*rowMeans(Kxy))/lambda
    gr <- function(alpha) c(Kxx%*%alpha + n*lambda*alpha + rowMeans(Kxy)/lambda)
    op <- optim(rep(0,n),fn,gr,method=method,control=control)
    par <- op$par
  }
  list(par=par,sigma=sigma,lambda=lambda,dat=dat)
}

## LOOCV for KuLSIF over the (sigma, lambda) grid; parallel computation is
## available by passing a "snow" cluster, e.g.
##   ncore <- 8
##   cluster <- makeCluster(ncore, type = "SOCK")
##   clusterExport(cluster, c("kernelMatrix","rbfdot"))
LOOCV_KuLSIF <- function(dat,
                         sigma=sigest(rbind(dat$de,dat$nu))[2],
                         lambda.list=1/min(nrow(dat$de),nrow(dat$nu))^0.9,
                         cluster=NULL){
  hparameters <- expand.grid(sigma,lambda.list)
  dimnames(hparameters)[[2]] <- c("sigma","lambda")
  if(is.null(cluster)){
    cvlist <- apply(hparameters,1,LOOCV_KuLSIF.single_lambda,dat)
  }else{
    clusterExport(cluster,c("kernelMatrix","rbfdot"))
    cvlist <- parApply(cluster,hparameters,1,LOOCV_KuLSIF.single_lambda,dat)
  }
  opt_para <- hparameters[which.min(cvlist),]
  list(lambda=lambda.list,sigma=sigma,cv=cvlist,
       opt_sigma=as.double(opt_para[1]),opt_lambda=as.double(opt_para[2]))
}
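## The helper below evaluates the leave-one-out CV score for a single
## (sigma, lambda) pair, as called by LOOCV_KuLSIF() above; smaller scores
## are better (which.min selects the optimum). The following description is
## inferred from the code, not taken from the reference: the pairs
## (de_i, nu_i), i = 1,...,min(n,m), appear to be held out jointly, and the
## held-out fits are recovered from a single inverse
## G = (Kxx + (n-1)*lambda*I)^{-1} and its diagonal (a rank-one-update
## shortcut), so each grid point costs one n x n inverse instead of
## min(n,m) refits.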
## hpara = c(sigma, lambda)
LOOCV_KuLSIF.single_lambda <- function(hpara,dat){
  sigma <- hpara[1]; lambda <- hpara[2]
  n <- nrow(dat$de); m <- nrow(dat$nu)
  E <- matrix(1,m,min(n,m)); diag(E) <- rep(0,min(n,m))
  Kxx <- kernelMatrix(rbfdot(sigma=sigma),dat$de)
  Kxy <- kernelMatrix(rbfdot(sigma=sigma),dat$de,dat$nu)
  Kyy <- kernelMatrix(rbfdot(sigma=sigma),dat$nu)
  ## leave-one-out coefficients, computed from a single matrix inverse
  modKxy <- Kxy[,1:min(n,m)] - rowSums(Kxy)
  G <- solve(Kxx+(n-1)*lambda*diag(1,n))
  S <- modKxy/((m-1)*lambda)
  GS <- G%*%S
  diagT <- (diag(GS)[1:min(n,m)])/(diag(G)[1:min(n,m)])
  A <- GS - t(G[1:min(n,m),]*diagT)
  B <- E/((m-1)*lambda)
  ## leave-one-out ratio estimates at the held-out de points (wx) and nu points (wy)
  wx <- pmax(rowSums((cbind(Kxx,Kxy)[1:min(n,m),])*cbind(t(A),t(B))),0)
  wy <- pmax(rowSums((cbind(t(Kxy),Kyy)[1:min(n,m),])*cbind(t(A),t(B))),0)
  ## leave-one-out estimate of the least-squares criterion (smaller is better)
  (sum(wx^2)/2-sum(wy))/min(n,m)
}
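## --------------------------------------------------------------------------
## Usage sketch. The data, sample sizes, and hyper-parameter grids below are
## arbitrary illustrative choices, not part of the method itself.
##
## set.seed(1)
## de <- matrix(rnorm(500), ncol=1)            ## de ~ p = N(0, 1)
## nu <- matrix(rnorm(300, mean=0.5), ncol=1)  ## nu ~ q = N(0.5, 1)
##
## ## single (sigma, lambda) pair, no LOOCV
## fit <- KuLSIF(de, nu, loocv=FALSE)
## est <- predict.KuLSIF(fit, seq(-3, 3, length.out=50))
## ## est$r: estimated ratio q/p; est$nr: normalized so that mean(nr) = 1
##
## ## grids of hyper-parameters: LOOCV is applied automatically
## fit.cv <- KuLSIF(de, nu, sigma=c(0.5, 1, 2), lambda=c(0.01, 0.1))
##
## ## optional parallel LOOCV with "snow"
## ## library(snow)
## ## cl <- makeCluster(4, type="SOCK")
## ## fit.cv <- KuLSIF(de, nu, sigma=c(0.5, 1, 2), lambda=c(0.01, 0.1), cluster=cl)
## ## stopCluster(cl)
## --------------------------------------------------------------------------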