Difference between subset and filter from dplyr

前端 未结 6 1594
Happy的楠姐
Happy的楠姐 2020-12-14 06:03

It seems to me that subset and filter (from dplyr) are having the same result. But my question is: is there at some point a potential difference, for ex. speed, data sizes i

6条回答
  •  萌比男神i
    2020-12-14 06:42

    In the main use cases they behave the same :

    library(dplyr)
    identical(
      filter(starwars, species == "Wookiee"),
      subset(starwars, species == "Wookiee"))
    # [1] TRUE
    

    But they have quite a few differences, including (I was as exhaustive as possible but might have missed some):

    • subset can be used on matrices
    • filter can be used on databases
    • filter drops row names
    • subset drops attributes other than class, names and row names
    • subset has a select argument
    • subset recycles its condition argument
    • filter supports conditions as separate arguments
    • filter supports the .data pronoun
    • filter supports some rlang features
    • filter supports grouping
    • filter supports n() and row_number()
    • filter is stricter
    • filter is a bit faster when it counts
    • subset has methods in other packages

    subset can be used on matrices

    subset(state.x77, state.x77[,"Population"] < 400)
    #         Population Income Illiteracy Life Exp Murder HS Grad Frost   Area
    # Alaska         365   6315        1.5    69.31   11.3    66.7   152 566432
    # Wyoming        376   4566        0.6    70.29    6.9    62.9   173  97203
    

    Though columns can't be used directly as variables in the subset argument

    subset(state.x77, Population < 400)
    

    Error in subset.matrix(state.x77, Population < 400) : object 'Population' not found

    Neither works with filter

    filter(state.x77, state.x77[,"Population"] < 400)
    

    Error in UseMethod("filter_") : no applicable method for 'filter_' applied to an object of class "c('matrix', 'double', 'numeric')"

    filter(state.x77, Population < 400)
    

    Error in UseMethod("filter_") : no applicable method for 'filter_' applied to an object of class "c('matrix', 'double', 'numeric')"

    filter can be used on databases

    library(DBI)
    con <- dbConnect(RSQLite::SQLite(), ":memory:")
    dbWriteTable(con, "mtcars", mtcars)
    tbl(con,"mtcars") %>% 
      filter(hp < 65)
    
    # # Source:   lazy query [?? x 11]
    # # Database: sqlite 3.19.3 [:memory:]
    #       mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
    #     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
    #   1  24.4     4 146.7    62  3.69 3.190 20.00     1     0     4     2
    #   2  30.4     4  75.7    52  4.93 1.615 18.52     1     1     4     2
    

    subset can't

    tbl(con,"mtcars") %>% 
      subset(hp < 65)
    

    Error in subset.default(., hp < 65) : object 'hp' not found

    filter drops row names

    filter(mtcars, hp < 65)
    #    mpg cyl  disp hp drat    wt  qsec vs am gear carb
    # 1 24.4   4 146.7 62 3.69 3.190 20.00  1  0    4    2
    # 2 30.4   4  75.7 52 4.93 1.615 18.52  1  1    4    2
    

    subset doesn't

    subset(mtcars, hp < 65)
    #              mpg cyl  disp hp drat    wt  qsec vs am gear carb
    # Merc 240D   24.4   4 146.7 62 3.69 3.190 20.00  1  0    4    2
    # Honda Civic 30.4   4  75.7 52 4.93 1.615 18.52  1  1    4    2
    

    subset drops attributes other than class, names and row names.

    cars_head <- head(cars)
    attr(cars_head, "info") <- "head of cars dataset"
    attributes(subset(cars_head, speed > 0))
    #> $names
    #> [1] "speed" "dist" 
    #> 
    #> $row.names
    #> [1] 1 2 3 4 5 6
    #> 
    #> $class
    #> [1] "data.frame"
    
    attributes(filter(cars_head, speed > 0))
    #> $names
    #> [1] "speed" "dist" 
    #> 
    #> $row.names
    #> [1] 1 2 3 4 5 6
    #> 
    #> $class
    #> [1] "data.frame"
    #> 
    #> $info
    #> [1] "head of cars dataset"
    

    subset has a select argument

    dplyr, by contrast, follows tidyverse principles, which aim at having each function do one thing, so select is a separate function.

    identical(
    subset(starwars, species == "Wookiee", select = c("name", "height")),
    filter(starwars, species == "Wookiee") %>% select(name, height)
    )
    # [1] TRUE
    

    subset also has a drop argument, which mostly makes sense in the context of using the select argument.

    subset recycles its condition argument

    half_iris <- subset(iris,c(TRUE,FALSE))
    dim(iris) # [1] 150   5
    dim(half_iris) # [1] 75  5
    

    filter doesn't

    half_iris <- filter(iris,c(TRUE,FALSE))
    

    Error in filter_impl(.data, quo) : Result must have length 150, not 2

    filter supports conditions as separate arguments

    Conditions are fed to ... so we can pass several conditions as separate arguments, which is the same as combining them with & but can be more readable because it avoids logical operator precedence issues and benefits from automatic indentation.

    identical(
      subset(starwars, 
             (species == "Wookiee" | eye_color == "blue") &
               mass > 120),
      filter(starwars, 
             species == "Wookiee" | eye_color == "blue", 
             mass > 120)
    )
    

    filter supports the use of the .data pronoun

    mtcars %>% filter(.data[["hp"]] < 65)
    
    #    mpg cyl  disp hp drat    wt  qsec vs am gear carb
    # 1 24.4   4 146.7 62 3.69 3.190 20.00  1  0    4    2
    # 2 30.4   4  75.7 52 4.93 1.615 18.52  1  1    4    2
    

    filter supports some rlang features

    x <- "hp"
    library(rlang)
    mtcars %>% filter(!!sym(x) < 65)
    #    mpg cyl  disp hp drat    wt  qsec vs am gear carb
    # 1 24.4   4 146.7 62 3.69 3.190 20.00  1  0    4    2
    # 2 30.4   4  75.7 52 4.93 1.615 18.52  1  1    4    2
    
    
    # Keep the rows of `data` whose column, passed unquoted as `var`, is below 65.
    filter65 <- function(data, var) {
      # Capture the caller's expression as a quosure, then unquote it in filter().
      col_quo <- enquo(var)
      filter(data, !!col_quo < 65)
    }
    mtcars %>% filter65(hp)
    #    mpg cyl  disp hp drat    wt  qsec vs am gear carb
    # 1 24.4   4 146.7 62 3.69 3.190 20.00  1  0    4    2
    # 2 30.4   4  75.7 52 4.93 1.615 18.52  1  1    4    2
    

    filter supports grouping

    iris %>%
      group_by(Species) %>%
      filter(Petal.Length < quantile(Petal.Length,0.01))
    
    # # A tibble: 3 x 5
    # # Groups:   Species [3]
    #   Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
    #          <dbl>       <dbl>        <dbl>       <dbl>     <fctr>
    # 1          4.6         3.6          1.0         0.2     setosa
    # 2          5.1         2.5          3.0         1.1 versicolor
    # 3          4.9         2.5          4.5         1.7  virginica
    
    iris %>%
      group_by(Species) %>%
      subset(Petal.Length < quantile(Petal.Length,0.01))
    
    # # A tibble: 2 x 5
    # # Groups:   Species [1]
    #     Sepal.Length Sepal.Width Petal.Length Petal.Width Species
    #            <dbl>       <dbl>        <dbl>       <dbl>  <fctr>
    #   1          4.3         3.0          1.1         0.1  setosa
    #   2          4.6         3.6          1.0         0.2  setosa
    

    filter supports n() and row_number()

    filter(iris, row_number() < n()/30)
    # Sepal.Length Sepal.Width Petal.Length Petal.Width Species
    # 1          5.1         3.5          1.4         0.2  setosa
    # 2          4.9         3.0          1.4         0.2  setosa
    # 3          4.7         3.2          1.3         0.2  setosa
    # 4          4.6         3.1          1.5         0.2  setosa
    

    filter is stricter

    It triggers errors if the input is suspicious.

    filter(iris, Species = "setosa")
    # Error: `Species` (`Species = "setosa"`) must not be named, do you need `==`?
    
    identical(subset(iris, Species = "setosa"), iris)
    # [1] TRUE
    
    df1 <- setNames(data.frame(a = 1:3, b=5:7),c("a","a"))
    # df1
    # a a
    # 1 1 5
    # 2 2 6
    # 3 3 7
    
    filter(df1, a > 2)
    #Error: Column `a` must have a unique name
    subset(df1, a > 2)
    # a a.1
    # 3 3   7
    

    filter is a bit faster when it counts

    Using the dataset that Benjamin built in his answer (153k rows), filter is about twice as fast, though this should rarely be a bottleneck.

    air <- lapply(1:1000, function(x) airquality) %>% bind_rows
    microbenchmark::microbenchmark(
      subset = subset(air, Temp>80 & Month > 5),
      filter = filter(air, Temp>80 & Month > 5)
    )
    
    # Unit: milliseconds
    #   expr      min        lq      mean    median        uq      max neval cld
    # subset 8.771962 11.551255 19.942501 12.576245 13.933290 108.0552   100   b
    # filter 4.144336  4.686189  8.024461  6.424492  7.499894 101.7827   100  a 
    

    subset has methods in other packages

    subset is an S3 generic, just as dplyr::filter is, but subset, being a base function, is more likely to have methods developed in other packages; one prominent example is zoo:::subset.zoo.

提交回复
热议问题