Finding pattern in a matrix in R

前端 未结 7 784
被撕碎了的回忆
被撕碎了的回忆 2020-12-30 04:40

I have a 8 x n matrix, for instance

set.seed(12345)
m <- matrix(sample(1:50, 800, replace=T), ncol=8)
head(m)

     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8         


        
7条回答
  •  半阙折子戏
    2020-12-30 05:20

    Here is a generalized function:

    PatternMatcher <- function(data, pattern, idx = NULL) {
      p <- unlist(pattern[1])
      if(is.null(idx)){
        p <- unlist(pattern[length(pattern)])
        PatternMatcher(data, rev(pattern)[-1], 
                       idx = Filter(function(n) all(p %in% intersect(data[n, ], p)),
                                    1:nrow(data)))
      } else if(length(pattern) > 1) {
        PatternMatcher(data, pattern[-1], 
                       idx = Filter(function(n) all(p %in% intersect(data[n, ], p)), 
                                    idx - 1))
      } else
        Filter(function(n) all(p %in% intersect(data[n, ], p)), idx - 1)
    }
    

    This is a recursive function which is reducing pattern in every iteration and checks only rows that go right after ones identified in the previous iteration. List structure allows passing the pattern in a convenient way:

    PatternMatcher(m, list(37, list(10, 29), 42))
    # [1] 57
    PatternMatcher(m, list(list(45, 24, 1), 7, list(45, 31), 4))
    # [1] 2
    PatternMatcher(m, list(1,3))
    # [1] 47 48 93
    

    Edit: The idea of the function above seems fine: check all rows for the vector pattern[[1]] and get indices r1, then check rows r1+1 for pattern[[2]] and get r2, etc. But it takes really much time at the first step when going through all rows. Of course, every step would take much time with e.g. m <- matrix(sample(1:10, 800, replace=T), ncol=8), i.e. when there is not much of a change in indices r1, r2, ... So here is another approach, here PatternMatcher looks very similar, but there is another function matchRow for finding rows that have all elements of vector.

    matchRow <- function(data, vector, idx = NULL){
      if(is.null(idx)){
        matchRow(data, vector[-1], 
                 as.numeric(unique(rownames(which(data == vector[1], arr.ind = TRUE)))))
      } else if(length(vector) > 0) {
        matchRow(data, vector[-1], 
                 as.numeric(unique(rownames(which(data[idx, , drop = FALSE] == vector[1], arr.ind = TRUE)))))
      } else idx
    }
    PatternMatcher <- function(data, pattern, idx = NULL) {
      p <- pattern[[1]]
      if(is.null(idx)){
        rownames(data) <- 1:nrow(data)
        p <- pattern[[length(pattern)]]
        PatternMatcher(data, rev(pattern)[-1], idx = matchRow(data, p))
      } else if(length(pattern) > 1) {
        PatternMatcher(data, pattern[-1], idx = matchRow(data, p, idx - 1))
      } else
        matchRow(data, p, idx - 1)
    }
    

    Comparison with the previous function:

    library(rbenchmark)
    bigM <- matrix(sample(1:50, 800000, replace=T), ncol=8)
    benchmark(PatternMatcher(bigM, list(37, c(10, 29), 42)), 
              PatternMatcher(bigM, list(1, 3)), 
              OldPatternMatcher(bigM, list(37, list(10, 29), 42)), 
              OldPatternMatcher(bigM, list(1, 3)), 
              replications = 10,
              columns = c("test", "elapsed"))
    #                                                  test elapsed
    # 4                 OldPatternMatcher(bigM, list(1, 3))   61.14
    # 3 OldPatternMatcher(bigM, list(37, list(10, 29), 42))   63.28
    # 2                    PatternMatcher(bigM, list(1, 3))    1.58
    # 1       PatternMatcher(bigM, list(37, c(10, 29), 42))    2.02
    
    verybigM1 <- matrix(sample(1:40, 8000000, replace=T), ncol=20)
    verybigM2 <- matrix(sample(1:140, 8000000, replace=T), ncol=20)
    benchmark(PatternMatcher(verybigM1, list(37, c(10, 29), 42)), 
              PatternMatcher(verybigM2, list(37, c(10, 29), 42)), 
              find.combo(verybigM1, convert.gui.input("37;10,29;42")),
              find.combo(verybigM2, convert.gui.input("37;10,29;42")),          
              replications = 20,
              columns = c("test", "elapsed"))
    #                                                      test elapsed
    # 3 find.combo(verybigM1, convert.gui.input("37;10,29;42"))   17.55
    # 4 find.combo(verybigM2, convert.gui.input("37;10,29;42"))   18.72
    # 1      PatternMatcher(verybigM1, list(37, c(10, 29), 42))   15.84
    # 2      PatternMatcher(verybigM2, list(37, c(10, 29), 42))   19.62
    

    Also now the pattern argument should be like list(37, c(10, 29), 42) instead of list(37, list(10, 29), 42). And finally:

    fastPattern <- function(data, pattern)
      PatternMatcher(data, lapply(strsplit(pattern, ";")[[1]], 
                        function(i) as.numeric(unlist(strsplit(i, split = ",")))))
    fastPattern(m, "37;10,29;42")
    # [1] 57
    fastPattern(m, "37;;42")
    # [1] 57  4
    fastPattern(m, "37;;;42")
    # [1] 33 56 77
    

提交回复
热议问题