How can I import SAS format files into R?

前端 未结 2 1374
余生分开走
余生分开走 2020-12-30 17:38

I am trying to analyze data from the 2012-2013 NATS survey, from this location. There are three files in the zip folder there, labelled 2012-2013 NATS format.sas, formats.sa

2条回答
  •  攒了一身酷
    2020-12-30 17:55

    This should be a one-liner:

    library('haven')
    sas <- read_sas('nats2012.sas7bdat', 'formats.sas7bcat')
    
    with(sas, table(SMOKSTATUS_R, RACEETHNIC))
    #             RACEETHNIC
    # SMOKSTATUS_R     1     2     3     4     5     6     7     8     9
    #            1  4045   455    55     7    63     0   675   393   373
    #            2  1183   222    38     2    26     0   217   255   154
    #            3 14480   957   238    14    95     3  1112   950   369
    #            4 23923  2532  1157    23   147     1  1755  3223   909
    #            5    81    18     4     0     1     0    11    17     9
    
    table(names(attr(sas[, 'SMOKSTATUS_R'], 'labels')[sas[, 'SMOKSTATUS_R']]),
          names(attr(sas[, 'RACEETHNIC'], 'labels')[sas[, 'RACEETHNIC']]))
    
    #                          Amer. Indian, AK Nat. Only, Non-Hispanic
    # Current everyday smoker                                        63
    # Current some days smoker                                       26
    # Former smoker                                                  95
    # Never smoker                                                  147
    # Unknown                                                         1
    

    Use haven to read in the data, but that also gives you some useful attributes, namely the variable labels:

    attributes(sas$SMOKSTATUS_R)
    # $label
    # [1] "SMOKER STATUS (4-level)"
    # 
    # $class
    # [1] "labelled"
    # 
    # $labels
    # Current everyday smoker Current some days smoker            Former smoker 
    #                       1                        2                        3 
    # Never smoker                  Unknown 
    #            4                        5 
    # 
    # $is_na
    # [1] FALSE FALSE FALSE FALSE FALSE
    

    You can easily write this into a function to use more generally:

    do_fmt <- function(x, fmt) {
      lbl <- if (!missing(fmt))
        unlist(unname(fmt)) else attr(x, 'labels')
    
      if (!is.null(lbl))
        tryCatch(names(lbl[match(unlist(x), lbl)]),
                 error = function(e) {
                   message(sprintf('formatting failed for %s', attr(x, 'label')),
                           domain = NA)
                   x
                 }) else x
    }
    
    table(do_fmt(sas[, 'SMOKSTATUS_R']),
          do_fmt(sas[, 'RACEETHNIC']))
    
    #                          Amer. Indian, AK Nat. Only, Non-Hispanic
    # Current everyday smoker                                        63
    # Current some days smoker                                       26
    # Former smoker                                                  95
    # Never smoker                                                  147
    # Unknown                                                         1
    

    And apply to the entire data set

    sas[] <- lapply(sas, do_fmt)
    sas$SMOKSTATUS_R[1:4]
    # [1] "Never smoker"  "Former smoker" "Former smoker" "Never smoker" 
    

    Although sometimes this fails like below. This looks like something wrong with the haven package

    attr(sas$SMOKTYPE, 'labels')
    # INAPPLICABLE            REFUSED                 DK    NOT ASCERTAINED 
    #     -4.00000           -0.62500           -0.50000           -0.46875 
    # PREMADE CIGARETTES      ROLL-YOUR-OWN               BOTH 
    #            1.00000            2.00000            3.00000 
    

    So instead of this, you can parse the format.sas file with some simple regexes

    locf <- function(x) {
      x <- data.frame(x, stringsAsFactors = FALSE)
      x[x == ''] <- NA
      indx <- !is.na(x)
    
      x[] <- lapply(seq_along(x), function(ii) {
        idx <- cumsum(indx[, ii])
        idx[idx == 0] <- NA
        x[, ii][indx[, ii]][idx]
      })
      x[, 1]
    }
    
    fmt <- readLines('~/desktop/2012-2013-NATS-Format/2012-2013-NATS-Format.sas')
    ## not sure if comments are allowed in the value definitions, but
    ## this will check for those in case
    fmt <- gsub('\\*.*;|\\/\\*.*\\*\\/', '', fmt)
    
    vars <- gsub('(?i)value\\W+(\\w*)|.', '\\1', fmt, perl = TRUE)
    vars <- locf(vars)
    
    regex <- '[\'\"].*[\'\"]|[\\w\\d-]+'
    vals <- gsub(sprintf('(?i)\\s*(%s)\\s*(=)\\s*(%s)|.', regex, regex),
                   '\\1\\2\\3', fmt, perl = TRUE)
    
    View(dd <- na.omit(data.frame(values = vars, formats = vals,
                                  stringsAsFactors = FALSE)))
    
    sp <- split(dd$formats, dd$values)
    sp <- lapply(sp, function(x) {
      x <- Filter(nzchar, x)
      x <- strsplit(x, '=')
      tw <- function(x) gsub('^\\s+|\\s+$', '', x)
      sapply(x, function(y)
        setNames(tw(y[1]), tw(y[2])))
    })
    

    So the smoke type formats (one of them that failed above), for example, gets parsed like this:

    sp['A5_']
    # $A5_
    # 'INAPPLICABLE'            'REFUSED'                 'DK' 
    #           "-1"                 "-7"                 "-8" 
    # 'NOT ASCERTAINED' 'PREMADE CIGARETTES'      'ROLL-YOUR-OWN'  'BOTH' 
    #              "-9"                  "1"                  "2"     "3" 
    

    And then you can use the function again to apply to the data

    table(do_fmt(sas['SMOKTYPE'], sp['A5_']))
    
    # 'BOTH'                 'DK'       'INAPPLICABLE' 
    #   736                   17                51857 
    # 'PREMADE CIGARETTES'            'REFUSED'      'ROLL-YOUR-OWN' 
    #                 7184                    2                  396 
    

提交回复
热议问题