########################################################################### # SMSVM v1.2.1: R functions written by SangJun Lee, Yuwon Kim, Yoonkyung # Lee, and Ja-Yong Koo. ########################################################################### library(quadprog) library(lpSolve) smsvm <- function(x, y, kernel.type, kernel.par = 1, lambda, lambda.theta, criterion = "hinge", x.test = NULL, y.test = NULL, cv = FALSE, fold = 5, epsilon = 1e-3, epsilon.H = 1e-5, isCombined = TRUE) { # initialize cat('----------\n') cat("Initialize \n") cat('----------\n') cat("c-step...\n") cstepResult <- cstep(x, y, lambda, kernel.type, kernel.par, theta = NULL, criterion, x.test, y.test, cv, fold, epsilon, epsilon.H) opt.lambda <- cstepResult$opt.lambda # first update cat('------------\n') cat("First Update \n") cat('------------\n') if(isCombined == TRUE) # combined method { cat("theta-step and c-step...", "\n") thetastepResult <- thetastep(x, y, opt.lambda, lambda.theta, kernel.type, kernel.par, criterion, x.test, y.test, cv, fold, epsilon, epsilon.H, T) opt.theta <- thetastepResult$opt.theta final.model <- thetastepResult$model return(list(cstep0 = cstepResult, thetastep1 = thetastepResult, opt.theta = opt.theta, model = final.model)) } else if(isCombined == FALSE) # sequential method { cat("theta-step...", "\n") thetastepResult <- thetastep(x, y, opt.lambda, lambda.theta, kernel.type, kernel.par, criterion, x.test, y.test, cv, fold, epsilon, epsilon.H, F) opt.theta <- thetastepResult$opt.theta cat("c-step...", "\n") cstep1Result <- cstep(x, y, lambda, kernel.type, kernel.par, opt.theta, criterion, x.test, y.test, cv, fold, epsilon, epsilon.H) final.model <- cstep1Result$model return(list(cstep0 = cstepResult, thetastep1 = thetastepResult, cstep1 = cstep1Result, opt.theta = opt.theta, model = final.model)) } } cstep <- function(x, y, lambda, kernel.type, kernel.par= 1, theta = NULL, criterion = "hinge", x.test = NULL, y.test = NULL, cv = FALSE, fold = 5, epsilon = 1e-3, epsilon.H = 1e-5) { if((criterion != "0-1") && (criterion != "hinge")) { cat("ERROR: Only 0-1 and hinge can be used as criterion!", "\n") return(NULL) } len.lambda <- length(lambda) ERR <- matrix(0, len.lambda, 1) HIN <- matrix(0, len.lambda, 1) kernel <- list(type=kernel.type, par=kernel.par) x <- as.matrix(x) anova.kernel <- make.anovaKernel(x, x, kernel) if (is.null(theta)) { theta <- matrix(1, anova.kernel$numK, 1)} K <- combine.kernel(anova.kernel, theta) if (cv & !is.null(y.test)) { cat('When cv=TRUE, the test data are not used in cross-validation.','\n') } if (cv) # cross-validation { ran <- data.split(y, fold) for(i.cv in 1:fold ) { cat("Leaving subset[", i.cv,"] out in",fold,"fold CV:","\n") omit <- (ran == i.cv) x.train <- x[!omit,] y.train <- y[!omit] x.test <- x[omit,] y.test <- y[omit] row.index <- 0 subanova.kernel <- make.anovaKernel(x.train, x.train, kernel) subK <- combine.kernel(subanova.kernel, theta) subanova.kernel.test <- make.anovaKernel(x.test, x.train, kernel) subK.test <- combine.kernel(subanova.kernel.test, theta) cat("lambda of length",len.lambda,"|") for(i in lambda) { row.index <- row.index + 1 exp2.lambda <- 2^i model <- msvm.compact(subK, y.train, exp2.lambda, epsilon, epsilon.H) fit.test <- predict.msvm.compact(subK.test, model) ERR[row.index] <- (ERR[row.index]+error.rate(y.test, fit.test)/fold) HIN[row.index] <- (HIN[row.index]+hinge(y.test, fit.test)/fold) cat('*') } cat("|\n") } cat("The minimum of average", fold, "fold cross-validated", criterion, "loss:","\n") } else if ( !cv & is.null(y.test) ) # in-sample evaluation { row.index <- 0 cat("lambda of length",len.lambda,"|") for(i in lambda) { row.index <- row.index + 1 exp2.lambda <- 2^i model <- msvm.compact(K, y, exp2.lambda, epsilon, epsilon.H) ERR[row.index] <- error.rate(y, model$fit) HIN[row.index] <- hinge(y, model$fit) cat('*') } cat('|\n') cat("The minimum of average in-sample", criterion,"loss:", "\n") } else # use the test data for tuning { x.test <- as.matrix(x.test) anova.kernel.test <- make.anovaKernel(x.test, x, kernel) K.test <- combine.kernel(anova.kernel.test, theta) row.index <- 0 cat("lambda of length",len.lambda,"|") for(i in lambda) { row.index <- row.index + 1 exp2.lambda <- 2^i model <- msvm.compact(K, y, exp2.lambda, epsilon, epsilon.H) fit.test <- predict.msvm.compact(K.test, model) ERR[row.index] <- error.rate(y.test, fit.test) HIN[row.index] <- hinge(y.test, fit.test) cat('*') } cat('|\n') cat("The minimum of average", criterion,"loss over test cases:", "\n") } # choose the optimal index for lambda # if the optimal values are not unique, choose the largest value # assuming that lambda is in increasing order. if(criterion == "0-1") { optIndex <- (len.lambda:1)[which.min(ERR[len.lambda:1])] cat(min(ERR),"\n") } else if(criterion == "hinge") { optIndex <- (len.lambda:1)[which.min(HIN[len.lambda:1])] cat(min(HIN),"\n") } # choose the best model opt.lambda <- lambda[optIndex] cat("The optimal lambda on log2 scale:", opt.lambda,"\n\n") opt.model <- msvm.compact(K, y, 2^opt.lambda, epsilon, epsilon.H) list(opt.lambda = opt.lambda, error = ERR, hinge = HIN, model = opt.model) } thetastep <- function(x, y, opt.lambda, lambda.theta, kernel.type, kernel.par = 1, criterion = "hinge", x.test = NULL, y.test = NULL, cv = FALSE, fold = 5, epsilon = 1e-3, epsilon.H = 1e-5, isCombined = TRUE, pretheta = NULL) { if((criterion != "0-1") && (criterion != "hinge")) { cat("Only 0-1 and hinge can be used as criterion!", "\n") return(NULL) } len.lambda.theta <- length(lambda.theta) ERR <- matrix(0, len.lambda.theta, 1) HIN <- matrix(0, len.lambda.theta, 1) exp2.lambda <- 2^opt.lambda kernel <- list(type=kernel.type, par=kernel.par) x <- as.matrix(x) anova.kernel <- make.anovaKernel(x, x, kernel) if (is.null(pretheta)) { pretheta <- matrix(1, anova.kernel$numK, 1)} K <- combine.kernel(anova.kernel, pretheta) initial.model <- msvm.compact(K, y, exp2.lambda, epsilon, epsilon.H) theta.seq <- matrix(0, len.lambda.theta, anova.kernel$numK) if (cv & !is.null(y.test)) { cat('When cv=TRUE, the test data are not used in cross-validation.','\n') } if (cv) # cross-validation { ran <- data.split(y, fold) for(i.cv in 1:fold) { cat("Leaving subset[", i.cv,"] out in",fold,"fold CV:","\n") omit <- (ran == i.cv) x.train <- x[!omit,] y.train <- y[!omit] x.test <- x[omit,] y.test <- y[omit] subanova.kernel <- make.anovaKernel(x.train, x.train, kernel) subanova.kernel.test <- make.anovaKernel(x.test, x.train, kernel) subK <- combine.kernel(subanova.kernel, pretheta) model.initial <- msvm.compact(subK, y.train, exp2.lambda, epsilon, epsilon.H) row.index <- 0 cat("lambda.theta of length",len.lambda.theta,"|") for(i in lambda.theta) { row.index <- (row.index + 1) exp2.lambda.theta <- 2^i model <- model.initial # find the optimal theta vector theta <- find.theta(y.train, subanova.kernel, model$cmat, model$bvec, exp2.lambda, exp2.lambda.theta) # combine kernels subK <- combine.kernel(subanova.kernel, theta) if(isCombined == TRUE) # combined method { model <- msvm.compact(subK, y.train, exp2.lambda, epsilon, epsilon.H) } subK.test <- combine.kernel(subanova.kernel.test, theta) fit.test <- predict.msvm.compact(subK.test, model) ERR[row.index] <- (ERR[row.index]+error.rate(y.test, fit.test)/fold) HIN[row.index] <- (HIN[row.index]+hinge(y.test, fit.test)/fold) cat('*') } cat('|\n') } cat("The minimum of average", fold, "fold cross-validated", criterion, "loss:","\n") # generate a sequence of theta vectors model <- initial.model row.index <- 0 for(i in lambda.theta) { row.index <- (row.index + 1) exp2.lambda.theta <- 2^i theta <- find.theta(y, anova.kernel, model$cmat, model$bvec, exp2.lambda, exp2.lambda.theta) theta.seq[row.index,] <- theta } } else if ( !cv & is.null(y.test) ) # in-sample evaluation { cat("lambda.theta of length",len.lambda.theta,"|") row.index <- 0 for(i in lambda.theta) { row.index <- (row.index + 1) exp2.lambda.theta <- 2^i model <- initial.model theta <- find.theta(y, anova.kernel, model$cmat, model$bvec, exp2.lambda, exp2.lambda.theta) # combine kernels K <- combine.kernel(anova.kernel, theta) if(isCombined == TRUE) #combined method { model <- msvm.compact(K, y, exp2.lambda, epsilon, epsilon.H) } else { model$fit <- predict.msvm.compact(K, model) } ERR[row.index] <- error.rate(y, model$fit) HIN[row.index] <- hinge(y, model$fit) theta.seq[row.index,] <- theta cat('*') } cat('|\n') cat("The minimum of average in-sample", criterion,"loss:", "\n") } else # use the test data for tuning { x.test <- as.matrix(x.test) anova.kernel.test <- make.anovaKernel(x.test, x, kernel) cat("lambda.theta of length",len.lambda.theta,"|") row.index <- 0 for(i in lambda.theta) { row.index <- (row.index + 1) exp2.lambda.theta <- 2^i model <- initial.model theta <- find.theta(y, anova.kernel, model$cmat, model$bvec, exp2.lambda, exp2.lambda.theta) # combine kernels K <- combine.kernel(anova.kernel, theta) if(isCombined == TRUE) #combined method { model <- msvm.compact(K, y, exp2.lambda, epsilon, epsilon.H) } K.test <- combine.kernel(anova.kernel.test, theta) fit.test <- predict.msvm.compact(K.test, model) ERR[row.index] <- error.rate(y.test, fit.test) HIN[row.index] <- hinge(y.test, fit.test) theta.seq[row.index,] <- theta } cat('|\n') cat("The minimum of average", criterion,"loss over test cases:", "\n") } # if the optimal values are not unique, choose the largest value # assuming that lambda.theta is in increasing order. if(criterion == "0-1") { optIndex <- (len.lambda.theta:1)[which.min(ERR[len.lambda.theta:1])] cat(min(ERR),"\n") } else if(criterion == "hinge") { optIndex <- (len.lambda.theta:1)[which.min(HIN[len.lambda.theta:1])] cat(min(HIN),"\n") } opt.lambda.theta <- lambda.theta[optIndex] cat("The optimal lambda.theta on log2 scale:", opt.lambda.theta,"\n") opt.model <- initial.model opt.theta <- find.theta(y, anova.kernel, opt.model$cmat, opt.model$bvec, exp2.lambda, 2^opt.lambda.theta) nsel <- sum(opt.theta>0) shrinkage <- mean(opt.theta) cat("The number of selected features out of",anova.kernel$numK,":",nsel,"\n") cat("The average shrinkage factor:",shrinkage,"\n\n") K <- combine.kernel(anova.kernel, opt.theta) if(isCombined == TRUE) #combined method { opt.model <- msvm.compact(K, y, exp2.lambda, epsilon, epsilon.H) } list(lambda.theta = lambda.theta, opt.lambda.theta = opt.lambda.theta, error = ERR, hinge = HIN, opt.theta = opt.theta, model = opt.model, nsel = nsel, shrinkage = shrinkage, theta.seq = theta.seq) } main.kernel <- function(x, u, kernel) { x <- as.matrix(x) u <- as.matrix(u) if (kernel$type == "linear") K <- (x %*% t(u)) if (kernel$type == "poly") K <- (1 + x %*% t(u))^kernel$par if (kernel$type == "rbf") { a <- as.matrix(apply(x^2, 1, 'sum')) b <- as.matrix(apply(u^2, 1, 'sum')) one.a <- matrix(1, ncol=length(b)) one.b <- matrix(1, ncol=length(a)) K1 <- one.a %x% a K2 <- x %*% t(u) K3 <- t(one.b %x% b) K <- exp(-(K1 - 2 * K2 + K3)/(2 * kernel$par^2)) } return(K) } spline.kernel <- function(x, u) { x <- as.matrix(x) u <- as.matrix(u) K1x <- (x - 1/2) K1u <- (u - 1/2) K2x <- (K1x^2 - 1/12)/2 K2u <- (K1u^2 - 1/12)/2 ax <- x%x%matrix(1, 1, nrow(u)) au <- u%x%matrix(1, 1, nrow(x)) b <- abs(ax - t(au)) K1 <- K1x%x%t(K1u) K2 <- K2x%x%t(K2u) - ((b - 1/2)^4 - (b - 1/2)^2/2 + 7/240)/24 list(K1 = K1, K2 = K2) } unit.scale <- function(x, limit = NULL) { x <- as.matrix(x) if (is.null(limit)) { r <- apply(x,2,range) x <- scale(x, r[1,] - .05*(r[2,]-r[1,]), 1.1*(r[2,]-r[1,])) } else { r <- limit x <- scale(x, r[1,] - .05*(r[2,]-r[1,]), 1.1*(r[2,]-r[1,])) x[x<0] <- 0 x[x>1] <- 1 } return(x) } make.anovaKernel <- function(x, u, kernel) { if (!is.matrix(x)) # degenerate case: x is a row vector { x <- t(as.matrix(x))} else { x <- as.matrix(x)} u <- as.matrix(u) dimx <- ncol(x) # calculate anova kernels for main effects if(kernel$type == "spline") { # assign the number of anova kernels numK <- 2*dimx # list of kernel matrices anova.kernel <- vector(mode="list", numK) # list of kernel coordinate indices kernelCoord <- vector(mode="list", numK) index <- 0 for (d in 1:dimx) { index <- index + 1 A <- as.matrix(x[,d]) B <- as.matrix(u[,d]) K.temp <- spline.kernel(A, B) anova.kernel[[index]] <- K.temp$K1 kernelCoord[[index]] <- paste("x", d, " linear", sep="") index <- index + 1 anova.kernel[[index]] <- K.temp$K2 kernelCoord[[index]] <- paste("x", d, " smooth", sep="") } } else if (kernel$type == 'spline2') { numK <- (2*dimx) + (2*dimx*(2*dimx-1)/2 - dimx) anova.kernel <- vector(mode="list", numK) kernelCoord <- vector(mode="list", numK) index <- 0 # main effects for(d in 1:dimx) { index <- index + 1 A <- as.matrix(x[,d]) B <- as.matrix(u[,d]) K.temp <- spline.kernel(A, B) anova.kernel[[index]] <- K.temp$K1 kernelCoord[[index]] <- paste("x", d, " linear", sep="") index <- index + 1 anova.kernel[[index]] <- K.temp$K2 kernelCoord[[index]] <- paste("x", d, " smooth", sep="") } # two-way interactions for (i in 1:(dimx-1)) { for (j in (i+1):dimx) { index <- index + 1 A.linear <- as.matrix(anova.kernel[[2*i-1]]) A.smooth <- as.matrix(anova.kernel[[2*i]]) B.linear <- as.matrix(anova.kernel[[2*j-1]]) B.smooth <- as.matrix(anova.kernel[[2*j]]) anova.kernel[[index]] <- A.linear*B.linear kernelCoord[[index]] <- paste("x", i, " linear,", " x", j, " linear", sep="") index <- index + 1 anova.kernel[[index]] <- A.linear*B.smooth kernelCoord[[index]] <- paste("x", i, " linear,", " x", j, " smooth", sep="") index <- index + 1 anova.kernel[[index]] <- A.smooth*B.linear kernelCoord[[index]] <- paste("x", i, " smooth,", " x", j, " linear", sep="") index <- index + 1 anova.kernel[[index]] <- A.smooth*B.smooth kernelCoord[[index]] <- paste("x", i, " smooth,", " x", j, " smooth", sep="") } } } else if (kernel$type == "spline-t") { numK <- dimx anova.kernel <- vector(mode="list", numK) kernelCoord <- vector(mode="list", numK) index <- 0 for (d in 1:dimx) { index <- index + 1 A <- as.matrix(x[,d]) B <- as.matrix(u[,d]) K.temp <- spline.kernel(A, B) anova.kernel[[index]] <- (K.temp$K1 + K.temp$K2) kernelCoord[[index]] <- paste("x", d, sep="") } } else if (kernel$type == 'spline-t2') { numK <- dimx+dimx*(dimx-1)/2 anova.kernel <- vector(mode="list", numK) kernelCoord <- vector(mode="list", numK) index <- 0 for (d in 1:dimx) { index <- index + 1 A <- as.matrix(x[,d]) B <- as.matrix(u[,d]) K.temp <- spline.kernel(A, B) anova.kernel[[index]] <- (K.temp$K1 + K.temp$K2) kernelCoord[[index]] <- paste("x", d, sep="") } for (i in 1:(dimx-1)) { for (j in (i+1):dimx) { index <- index + 1 A <- anova.kernel[[i]] B <- anova.kernel[[j]] anova.kernel[[index]] <- A*B kernelCoord[[index]] <- paste("x", i, " x", j, sep="") } } } else { numK <- dimx anova.kernel <- vector(mode="list", numK) kernelCoord <- vector(mode="list", numK) for (d in 1:dimx) { A <- as.matrix(x[,d]) B <- as.matrix(u[,d]) anova.kernel[[d]] <- main.kernel(A, B, kernel) kernelCoord[[d]] <- paste("x", d, sep="") } } list(K = anova.kernel, coord = kernelCoord, numK = numK, kernel = kernel) } combine.kernel <- function(anova.kernel, theta = rep(1, anova.kernel$numK)) { K <- 0 for (d in 1:anova.kernel$numK) { K <- (K + theta[d]*anova.kernel$K[[d]]) } return(K) } find.theta <- function(y, anova.kernel, cmat, bvec, lambda, lambda.theta) { if (anova.kernel$numK == 1) { cat("Only one kernel", "\n") return(c(1)) } # standard LP form : # min a^T x , subject to A1x <<- a1 n.data <- length(y) n.class <- max(y) bvec <- as.matrix(bvec) # convert y into msvm class code trans.Y <- class.code(y, n.class) # calculate the 'a' matrix a <- matrix(trans.Y, ncol=1) a[a==1] <- 0 a[a<0] <- 1/n.data # indices for the nonzero elements nonzeroIndex <- matrix((a != 0), ncol=1) a <- as.matrix(a[nonzeroIndex]) # initialize M M <- matrix(rep(0, anova.kernel$numK), ncol=1) # calculate M for (d in 1:anova.kernel$numK) { for (j in 1:n.class) { M[d] <- (M[d] + t(cmat[,j])%*%anova.kernel$K[[d]]%*%cmat[,j]) } M[d] <- (lambda/2*M[d] + (lambda.theta)) } a <- rbind(a, M) # calculate N matrix for (d in 1:anova.kernel$numK) { K <- anova.kernel$K[[d]] for (j in 1:n.class) { if(j == 1) { temp.N <- K%*%cmat[,j] } else { temp.N <- rbind(temp.N, K%*%cmat[,j]) } } if(d == 1) { N <- temp.N } else { N <- cbind(N, temp.N) } } # constraints n.nonzeroIndex <- sum(as.numeric(nonzeroIndex)) # A matrix I.nonzeroIndex <- diag(1, n.nonzeroIndex) N.nonzeroIndex <- as.matrix(N[nonzeroIndex,]) A.theta <- cbind(matrix(0, anova.kernel$numK, n.nonzeroIndex), diag(-1, anova.kernel$numK)) A.ineq <- rbind(cbind(I.nonzeroIndex, -N.nonzeroIndex), A.theta) b.t1 <- matrix(rep(1, n.data), ncol=1)%x%t(bvec) b.t2 <- matrix(b.t1 - trans.Y, ncol=1) b.nonzeroIndex <- b.t2[nonzeroIndex] b.ineq <- rbind(as.matrix(b.nonzeroIndex), matrix(-1, anova.kernel$numK, 1)) # constraint directions const.dir <- matrix(rep(">=", nrow(matrix(b.ineq)))) # find solution by LP lp <- lp("min", objective.in=a, const.mat=A.ineq, const.dir=const.dir, const.rhs=b.ineq)$solution # find the theta vector only from the solution theta <- cbind(matrix(0, anova.kernel$numK, n.nonzeroIndex), diag(1, anova.kernel$numK))%*%matrix(lp,ncol=1) return(as.vector(theta)) } msvm.compact <- function(K, y, lambda, epsilon = 1e-3, epsilon.H = 1e-5) { # the sample size, the number of classes and dimension of QP problem n.class <- max(y) n.data <- length(y) qp.dim <- (n.data * n.class) # convert y into msvm class code trans.Y <- class.code(y, n.class) # optimize alpha by solve.QP: min(-d^T b + 1/2 b^T D b) # subject to A^T b ><- b_0 # following steps (1)-(6) # (1) preliminary quantities Jk <- matrix(1, nrow=n.class, ncol=n.class) Ik <- diag(1, n.class) # vectorize y matrix y.vec <- as.vector(trans.Y) # index for non-trivial alphas nonzeroIndex <- (y.vec != 1) # (2) compute D <-H H <- ((Ik - Jk/n.class) %x% K) # subset the columns and rows for non-trivial alpha's Reduced.H <- H[nonzeroIndex, nonzeroIndex] # add a small number for stability diag(Reduced.H) <- (diag(Reduced.H) + epsilon.H) # (3) compute d <- g g <- (-n.data * lambda * y.vec) # subset the components with non-trivial alpha's Reduced.g <- g[nonzeroIndex] n.nonzeroIndex <- length(Reduced.g) # (4) compute A <- R # equality constraint matrix R1 <- ((Ik - Jk/n.class) %x% matrix(rep(1, n.data), nrow=1)) # eliminate one redundant equality constraint R1 <- matrix(R1[1:(n.class - 1), ], nrow = (n.class - 1), ncol = ncol(R1)) # choose components with non-trivial alpha's Reduced.R1 <- matrix(R1[, nonzeroIndex], nrow = nrow(R1), ncol = n.nonzeroIndex) # inequality constraint matrix R2 <- diag(rep(1, n.data*(n.class-1))) R2 <- rbind(R2, -R2) # R consists of equality and inequality constraints R <- t(rbind(Reduced.R1, R2)) # (5) compute (b_0, b) <- r # right hand side of equality constraints r1 <- rep(0, nrow(Reduced.R1)) # right hand side of inequality constraints r2 <- c(rep(0, nrow(R2)/2), rep(-1, nrow(R2)/2)) # R consists of right hand sides of equality and inequality constraints r <- c(r1, r2) # (6) find solution by solve.QP.compact nonzero <- find.nonzero(R) Amat <- nonzero$Amat.compact Aind <- nonzero$Aind dual <- solve.QP.compact(Reduced.H, Reduced.g, Amat, Aind, r, meq=nrow(Reduced.R1)) # place the dual solution into the non-trivial alpha positions alpha <- rep(0, qp.dim) alpha[nonzeroIndex] <- dual$solution # make alpha zero if they are too small alpha[alpha < 0] <- 0 alpha[alpha > 1] <- 1 # reshape alpha into a n.data by n.class matrix alpha <- matrix(alpha, nrow=n.data) # compute cmat <- matrix of estimated coefficients cmat <- -((alpha - matrix(rep(rowMeans(alpha), n.class), ncol=n.class))/(n.data*lambda)) # find b vector Kcmat <- (K %*% cmat) flag<-T while(flag){ logic <- ((alpha > epsilon) & (alpha < (1-epsilon))) bvec <- numeric(n.class) if (all(colSums(logic) > 0)) { # Using alphas between 0 and 1, we get bvec by KKT conditions for (i in 1:n.class) bvec[i] <- mean((trans.Y[,i] - Kcmat[,i])[logic[,i]]) if (abs(sum(bvec)) < 0.001) { flag <- F } else { epsilon <- min( epsilon*2, 0.5) } } else { flag<-F # Otherwise, LP starts to find b vector # reformulate LP w/o equality constraint and redudancy # objective function with (b_j)_+,-(b_j)_, j=1,...,(k-1) and \xi_ij a <- c(rep(0, 2*(n.class-1)), rep(1,n.data*(n.class-1))) # inequality conditions B1 <- -diag(1,n.class-1) %x% rep(1, n.data) B2 <- matrix(1,n.data,n.class-1) A <- cbind(B1, -B1) A <- rbind(A, cbind(B2,-B2)) A <- A[nonzeroIndex,] # reduced.A A <- cbind(A, diag(1, n.data*(n.class-1))) b <- matrix(Kcmat - trans.Y, ncol=1) b <- b[nonzeroIndex] # reduced.b # constraint directions const.dir <- matrix(rep(">=", nrow(A))) bpos <- lp("min", objective.in=a, const.mat=A, const.dir=const.dir, const.rhs=b)$solution[1:(2*(n.class-1))] bvec <- cbind(diag(1,n.class-1), -diag(1,n.class-1))%*%matrix(bpos, ncol=1) bvec <- c(bvec,-sum(bvec)) } } # compute the fitted values fit <- (matrix(rep(bvec, n.data), ncol=n.class, byrow=T) + Kcmat) # return the output list(cmat = cmat, bvec = bvec, fit = fit) } find.nonzero <- function(Amat) { nr <- nrow(Amat) nc <- ncol(Amat) Amat.compact <- matrix(0, nr, nc) Aind <- matrix(0, nr+1, nc) for (j in 1:nc) { index <- (1:nr)[Amat[,j] != 0] number <- length(index) Amat.compact[1:number,j] <- Amat[index,j] Aind[1,j] <- number Aind[2:(number+1),j] <- index } max.number <- max(Aind[1,]) Amat.compact <- Amat.compact[1:max.number,] Aind <- Aind[1:(max.number+1),] list(Amat.compact = Amat.compact, Aind = Aind) } predict.smsvm <- function(x, x.new, kernel, model, theta) { x <- as.matrix(x) if (!is.matrix(x.new)) # degenerate case: x.new is a row vector { x.new <- t(as.matrix(x.new)) } else { x.new <- as.matrix(x.new) } cmat <- as.matrix(model$cmat) bvec <- as.matrix(model$bvec) theta <- as.matrix(theta) # create a kernel matrix anova.kernel.new <- make.anovaKernel(x.new, x, kernel) K <- combine.kernel(anova.kernel.new, theta) # compute SMSVM predicted values for new instances fit <-(matrix(rep(bvec, nrow(x.new)), ncol=ncol(cmat), byrow=T) +(K%*%cmat)) return(fit) } predict.msvm.compact <- function(K, model) { cmat <- as.matrix(model$cmat) bvec <- as.matrix(model$bvec) n.data <- nrow(K) n.class <- ncol(cmat) fit <- (matrix(rep(bvec, n.data), ncol=n.class, byrow=T) + (K %*% cmat)) return(fit) } hinge <- function(y, fit) { n.data <- length(y) n.class <- ncol(fit) Y <- class.code(y, n.class) # L: misclassification cost matrix # (the same cost for different types of mistakes) L <- Y L[Y == 1] <- 0 L[Y != 1] <- 1 residual <- (fit - Y) residual[residual <= 0] <- 0 hin <- (sum(L * residual)/n.data) return(hin) } error.rate <- function(y, fit) { class.pred <- apply(fit,1,which.max) return(sum(class.pred != y)/length(class.pred)) } class.code <- function(y, k=max(y)) { # k: the number of classes classcodes <- diag(rep(k/(k-1),k)) - 1/(k-1)*matrix(1,k,k) if (length(y)==1) # degenerate case { Y <- t(as.matrix(classcodes[y,])) } else { Y <- as.matrix(classcodes[y,]) } return(Y) } msvm <- function(x, y, lambda, kernel.type, kernel.par= 1, criterion = "hinge", x.test = NULL, y.test = NULL, cv = FALSE, fold = 5, epsilon = 1e-3, epsilon.H = 1e-5) { if((criterion != "0-1") && (criterion != "hinge")) { cat("ERROR: Only 0-1 and hinge can be used as criterion!", "\n") return(NULL) } len.lambda <- length(lambda) ERR <- matrix(0, len.lambda, 1) HIN <- matrix(0, len.lambda, 1) kernel <- list(type=kernel.type, par=kernel.par) x <- as.matrix(x) K <- eval.kernel(x, x, kernel) if (cv & !is.null(y.test)) { cat('When cv=TRUE, the test data are not used in cross-validation.','\n') } if (cv) # cross-validation { ran <- data.split(y, fold) for(i.cv in 1:fold ) { cat("Leaving subset[", i.cv,"] out in",fold,"fold CV:","\n") omit <- (ran == i.cv) x.train <- x[!omit,] y.train <- y[!omit] x.test <- x[omit,] y.test <- y[omit] row.index <- 0 subK <- eval.kernel(x.train, x.train, kernel) subK.test <- eval.kernel(x.test, x.train, kernel) cat("lambda of length",len.lambda,"|") for(i in lambda) { row.index <- row.index + 1 exp2.lambda <- 2^i model <- msvm.compact(subK, y.train, exp2.lambda, epsilon, epsilon.H) fit.test <- predict.msvm.compact(subK.test, model) ERR[row.index] <- (ERR[row.index]+error.rate(y.test, fit.test)/fold) HIN[row.index] <- (HIN[row.index]+hinge(y.test, fit.test)/fold) cat('*') } cat('|\n') } cat("The minimum of average", fold, "fold cross-validated", criterion, "loss:","\n") } else if ( !cv & is.null(y.test) ) # in-sample evaluation { cat("lambda of length",len.lambda,"|") row.index <- 0 for(i in lambda) { row.index <- row.index + 1 exp2.lambda <- 2^i model <- msvm.compact(K, y, exp2.lambda, epsilon, epsilon.H) ERR[row.index] <- error.rate(y, model$fit) HIN[row.index] <- hinge(y, model$fit) cat('*') } cat('|\n') cat("The minimum of average in-sample", criterion,"loss:", "\n") } else # use the test data for tuning { x.test <- as.matrix(x.test) K.test <- eval.kernel(x.test, x, kernel) cat("lambda of length",len.lambda,"|") row.index <- 0 for(i in lambda) { row.index <- row.index + 1 exp2.lambda <- 2^i model <- msvm.compact(K, y, exp2.lambda, epsilon, epsilon.H) fit.test <- predict.msvm.compact(K.test, model) ERR[row.index] <- error.rate(y.test, fit.test) HIN[row.index] <- hinge(y.test, fit.test) cat('*') } cat('|\n') cat("The minimum of average", criterion,"loss over test cases:", "\n") } # choose the optimal index for lambda # if the optimal values are not unique, choose the largest value # assuming that lambda is in increasing order. if(criterion == "0-1") { optIndex <- (len.lambda:1)[which.min(ERR[len.lambda:1])] cat(min(ERR),"\n") } else if(criterion == "hinge") { optIndex <- (len.lambda:1)[which.min(HIN[len.lambda:1])] cat(min(HIN),"\n") } # choose the best model opt.lambda <- lambda[optIndex] cat("The optimal lambda on log2 scale:", opt.lambda,"\n") opt.model <- msvm.compact(K, y, 2^opt.lambda, epsilon, epsilon.H) list(opt.lambda = opt.lambda, error = ERR, hinge = HIN, model = opt.model) } eval.kernel <- function(x, u=x, kernel) { # x: m by d data matrix # u: n by d data matrix # compute m by n kernel matrix if (!is.matrix(x)) # degenerate case: x is a row vector { x <- t(as.matrix(x))} else { x <- as.matrix(x)} u <- as.matrix(u) if ( any(kernel$type == c('linear','poly','rbf')) ) { K <- main.kernel(x,u, kernel) } else if (any(kernel$type == c('spline','spline-t'))) { dimx <- ncol(x) K <- 0 for (d in 1:dimx) { K.temp <- spline.kernel(as.matrix(x[,d]), as.matrix(u[,d])) K <- K + K.temp$K1 + K.temp$K2 } } else if (any(kernel$type == c('spline2','spline-t2'))) { dimx <- ncol(x) K <- 0 anova.kernel <- vector(mode="list", dimx) # main effects for(d in 1:dimx) { K.temp <- spline.kernel(as.matrix(x[,d]), as.matrix(u[,d])) anova.kernel[[d]] <- K.temp$K1 + K.temp$K2 K <- K + anova.kernel[[d]] } # two-way interactions for (i in 1:(dimx-1)) { for (j in (i+1):dimx) { K <- K + anova.kernel[[i]]*anova.kernel[[j]] } } } else {cat('The specified kernel type is not supported.' ,'\n')} return(K) } predict.msvm <-function(x, x.new, kernel, model) { x <- as.matrix(x) if (!is.matrix(x.new)) # degenerate case: x.new is a row vector { x.new <- t(as.matrix(x.new)) } else { x.new <- as.matrix(x.new) } cmat <- as.matrix(model$cmat) bvec <- as.matrix(model$bvec) K <- eval.kernel(x.new, x, kernel) fit <- (matrix(rep(bvec, nrow(x.new)), ncol=ncol(cmat), byrow=T) + (K %*% cmat)) return(fit) } data.split <- function(y, fold, k = max(y), seed = length(y)) { # k: the number of classes n.data <- length(y) class.size <- table(y) ran <- rep(0, n.data) if ( (min(class.size) < fold) & (fold != n.data) ) { warning(' The given fold is bigger than the smallest class size. \n Only a fold size smaller than the minimum class size \n or the same as the sample size (LOOCV) is supported.\n') return(NULL) } if ( min(class.size)>= fold ) { set.seed(seed) for (j in 1:k) { ran[y==j] <- ceiling(sample(class.size[j])/(class.size[j]+1)*fold) } } else if ( fold == n.data) { ran <- 1:n.data } return(ran) } msvm.plot <- function(x, y, kernel, model, theta = NULL) { n.grid <- 51 r1 <- range(x[,1]) r2 <- range(x[,2]) u <- seq(r1[1]-.05*(r1[2]-r1[1]),r1[2]+.05*(r1[2]-r1[1]), length = n.grid) v <- seq(r2[1]-.05*(r2[2]-r2[1]),r2[2]+.05*(r2[2]-r2[1]), length = n.grid) x.new <- cbind(rep(u, rep(n.grid, n.grid)), rep(v, n.grid)) if (is.null(theta)) # ordinary MSVM { fit.new <- predict.msvm(x, x.new, kernel, model) } else { fit.new <- predict.smsvm(x, x.new, kernel, model, theta) } fit.new.class <- apply(fit.new,1,which.max) plot(x, type="n", xlab=expression(x[1]), ylab=expression(x[2]),frame.plot=T) for (j in 1:max(y)) { points(x[(y == j),], pch=1, col=j+1)} for (j in 1:max(y)) { points(x.new[(fit.new.class == j),], pch='.', col=j+1) } # delineate the boundary for (j in 1:max(y)) { fjcontrast <- fit.new[,j] - apply(as.matrix(fit.new[,-j]),1, max) fjcontrast <- t(matrix(fjcontrast, n.grid, n.grid)) contour(u, v, fjcontrast, add=T, levels=0, labex=0, lty=1, drawlabels=FALSE) } } theta.plot <- function(lambda.theta, theta.seq) { plot(range(lambda.theta), y=c(0,1), type="n", xlab=expression(log[2](lambda[theta])), ylab=expression(theta), frame.plot=T) for (i in 1:ncol(theta.seq)) { lines(lambda.theta, theta.seq[,i],lty=1,col=i+1) points(lambda.theta, theta.seq[,i],pch=1,col=i+1) } } twod.data <- function(n.data) { x <- runif(n.data) p1 <- (0.97 * exp(-3*x)) p3 <- exp(-2.5 * (x-1.2)^2) p2 <- (1- p1 - p3) # class.prob is a matrix of probabilities. # each column contains the probability that y is in each class for fixed x class.prob <- cbind(p1, p2, p3) # generate x and y x <- as.matrix(x) u <- runif(n.data) y <- rep(0, n.data) y[u <= p1] <- 1 y[u > p1 & u < (p1 + p2)] <- 2 y[u >= (p1 + p2)] <- 3 x2 <- runif(n.data) x <- cbind(x, x2) return(list(x=x,y=y,p=class.prob)) } twod.k4 <- function(n.data) { y <- rmultinom(n.data, 1, rep(.25,4)) y <- apply(y, 2, which.max) # class 1 x1.mean <- c(1, 0, -3, -2) x2.mean <- c(-2, 2, 1, -2.5) x1.sd <- c(1, 1, 1, 1) x2.sd <- c(1, 1, 1, 1) x <- matrix(0, n.data, 2) for ( j in 1:4 ) { j.class <- sum(y==j) x[y == j, 1] <- rnorm(j.class, x1.mean[j], x1.sd[j]) x[y == j, 2] <- rnorm(j.class, x2.mean[j], x2.sd[j]) } return(list(x=x,y=y)) }