build word co-occurence edge list in R

前端 未结 3 505
南方客
南方客 2020-12-16 06:35

I have a chunk of sentences and I want to build the undirected edge list of word co-occurrence and see the frequency of every edge. I took a look at the tm pack

相关标签:
3条回答
  • 2020-12-16 06:44

    Here's a base R way:

    d <- read.table(text='sentence_id text
    1           "a b c d e"
    2           "a b b e"
    3           "b c d"
    4           "a e"', header=TRUE, as.is=TRUE)
    
    result.vec <- table(unlist(lapply(d$text, function(text) {
        pairs <- combn(unique(scan(text=text, what='', sep=' ')), m=2)
        interaction(pairs[1,], pairs[2,])
    })))
    # a.b b.b c.b d.b a.c b.c c.c d.c a.d b.d c.d d.d a.e b.e c.e d.e 
    #   2   0   0   0   1   2   0   0   1   2   2   0   3   2   1   1 
    
    result <- subset(data.frame(do.call(rbind, strsplit(names(result.vec), '\\.')), freq=as.vector(result.vec)), freq > 0)
    with(result, result[order(X1, X2),])
    
    #    X1 X2 freq
    # 1   a  b    2
    # 5   a  c    1
    # 9   a  d    1
    # 13  a  e    3
    # 6   b  c    2
    # 10  b  d    2
    # 14  b  e    2
    # 11  c  d    2
    # 15  c  e    1
    # 16  d  e    1
    
    0 讨论(0)
  • 2020-12-16 06:59

    It's convoluted so there's got to be a better approach:

    dat <- read.csv(text="sentence_id, text
    1,           a b c d e
    2,           a b b e
    3,           b c d
    4,           a e", header=TRUE)
    
    
    library(qdapTools); library(tidyr)
    x <- t(mtabulate(with(dat, by(text, sentence_id, bag_o_words))) > 0)
    out <- x %*% t(x)
    out[upper.tri(out, diag=TRUE)] <- NA
    
    out2 <- matrix2df(out, "word1") %>%
        gather(word2, freq, -word1) %>%
        na.omit() 
    
    rownames(out2) <- NULL
    out2
    
    ##    word1 word2 freq
    ## 1      b     a    2
    ## 2      c     a    1
    ## 3      d     a    1
    ## 4      e     a    3
    ## 5      c     b    2
    ## 6      d     b    2
    ## 7      e     b    2
    ## 8      d     c    2
    ## 9      e     c    1
    ## 10     e     d    1
    

    Base only solution

    out <- lapply(with(dat, split(text, sentence_id)), function(x) {
        strsplit(gsub("^\\s+|\\s+$", "", as.character(x)), "\\s+")[[1]]
    })
    
    nms <- sort(unique(unlist(out)))
    
    out2 <- lapply(out, function(x) {
        as.data.frame(table(x), stringsAsFactors = FALSE)
    })
    
    dat2 <- data.frame(x = nms)
    
    for(i in seq_along(out2)) {
        m <- merge(dat2, out2[[i]], all.x = TRUE)
        names(m)[i + 1] <- dat[["sentence_id"]][i]
        dat2 <- m
    }
    
    dat2[is.na(dat2)] <- 0
    x <- as.matrix(dat2[, -1]) > 0
    
    out3 <- x %*% t(x)
    out3[upper.tri(out3, diag=TRUE)] <- NA
    dimnames(out3) <- list(dat2[[1]], dat2[[1]])
    
    out4 <- na.omit(data.frame( 
            word1 = rep(rownames(out3), ncol(out3)),  
            word2 = rep(colnames(out3), each = nrow(out3)),
            freq = c(unlist(out3)),
            stringsAsFactors = FALSE)
    )
    
    row.names(out4) <- NULL
    
    out4
    
    0 讨论(0)
  • 2020-12-16 07:03

    This is very closely related to @TylerRinker's answer, but using different tools.

    library(splitstackshape)
    library(reshape2)
    
    temp <- crossprod(
      as.matrix(
        cSplit_e(d, "text", " ", type = "character", 
                 fill = 0, drop = TRUE)[-1]))
    temp[upper.tri(temp, diag = TRUE)] <- NA
    melt(temp, na.rm = TRUE)
    #      Var1   Var2 value
    # 2  text_b text_a     2
    # 3  text_c text_a     1
    # 4  text_d text_a     1
    # 5  text_e text_a     3
    # 8  text_c text_b     2
    # 9  text_d text_b     2
    # 10 text_e text_b     2
    # 14 text_d text_c     2
    # 15 text_e text_c     1
    # 20 text_e text_d     1
    

    The "text_" parts of "Var1" and "Var2" can be stripped easily with sub or gsub.

    0 讨论(0)
提交回复
热议问题