read/write data in libsvm format

前端 未结 7 2093
慢半拍i
慢半拍i 2020-11-30 11:05

How do I read/write libsvm data into/from R?

The libsvm format is sparse data like

[ 

        
7条回答
  •  长情又很酷
    2020-11-30 11:48

    Based on some comments. I add it as an aswer so it's easier for others to use. This is to write data in libsvm format.

    Function to write a data.frame to svm light format. I've added a train={TRUE, FALSE} argument in case the data doesn't have labels. In this case, the class index is ignored.

    write.libsvm = function(data, filename= "out.dat", class = 1, train=TRUE) {
      out = file(filename)
      if(train){
        writeLines(apply(data, 1, function(X) {
          paste(X[class], 
                apply(cbind(which(X!=0)[-class], 
                            X[which(X!=0)[-class]]), 
                      1, paste, collapse=":"), 
                collapse=" ") 
          }), out)
      } else {
        # leaves 1 as default for the new data without predictions. 
        writeLines(apply(data, 1, function(X) {
          paste('1',
                apply(cbind(which(X!=0), X[which(X!=0)]), 1, paste, collapse=":"), 
                collapse=" ") 
          }), out)
      }
      close(out) 
    }
    

    ** EDIT **

    Another option - In case you already have the data in a data.table object

    libfm and SVMlight have the same format, so this function should work.

    library(data.table)
    
    data.table.fm <- function (data = X, fileName = "../out.fm", target = "y_train", 
        train = TRUE) {
        if (train) {
            if (is.logical(data[[target]]) | sum(levels(factor(data[[target]])) == 
                levels(factor(c(0, 1)))) == 2) {
                data[[target]][data[[target]] == TRUE] = 1
                data[[target]][data[[target]] == FALSE] = -1
            }
        }
        specChar = "\\(|\\)|\\||\\:"
        specCharSpace = "\\(|\\)|\\||\\:| "
        parsingNames <- function(x) {
            ret = c()
            for (el in x) ret = append(ret, gsub(specCharSpace, "_", 
                el))
            ret
        }
        parsingVar <- function(x, keepSpace, hard_parse) {
            if (!keepSpace) 
                spch = specCharSpace
            else spch = specChar
            if (hard_parse) 
                gsub("(^_( *|_*)+)|(^_$)|(( *|_*)+_$)|( +_+ +)", 
                    " ", gsub(specChar, "_", gsub("(^ +)|( +$)", 
                      "", x)))
            else gsub(spch, "_", x)
        }
        setnames(data, names(data), parsingNames(names(data)))
        target = parsingNames(target)
        format_vw <- function(column, formater) {
            ifelse(as.logical(column), sprintf(formater, j, column), 
                "")
        }
        all_vars = names(data)[!names(data) %in% target]
        cat("Reordering data.table if class isn't first\n")
        target_inx = which(names(data) %in% target)
        rest_inx = which(!names(data) %in% target)
        cat("Adding Variable names to data.table\n")
        for (j in rest_inx) {
            column = data[[j]]
            formater = "%s:%f"
            set(data, i = NULL, j = j, value = format_vw(column, 
                formater))
            cat(sprintf("Fixing %s\n", j))
        }
        data = data[, c(target_inx, rest_inx), with = FALSE]
        drop_extra_space <- function(x) {
            gsub(" {1,}", " ", x)
        }
        cat("Pasting data - Removing extra spaces\n")
        data = apply(data, 1, function(x) drop_extra_space(paste(x, 
            collapse = " ")))
        cat("Writing to disk\n")
        write.table(data, file = fileName, sep = " ", row.names = FALSE, 
            col.names = FALSE, quote = FALSE)
    }
    

提交回复
热议问题