Web Scraping a tableauViz into an R dataframe

前端 未结 1 948
野趣味
野趣味 2020-12-11 21:11

I have spent a lot of time searching for an answer to this, but have not found anything yet. What I am trying to accomplish is to scrape Tableau table information that is co

相关标签:
1条回答
  • 2020-12-11 22:03

    In order to get the data, you need the tableau URL which is in this case :

    https://public.tableau.com/views/S07StuP58/Dashboard1?:embed=y&:showVizHome=no

    The flow is the following :

    • call the following url :

      GET https://public.tableau.com/views/S07StuP58/Dashboard1?:embed=y&:showVizHome=no
      
    • extract the JSON content from the textarea with id tsConfigContainer

    • build the url with the session_id

      POST https://public.tableau.com/{vizql_path}/bootstrapSession/sessions/{session_id}
      
    • extract the JSON data from the response which is not JSON originally (regex to split the data)

    • extract the data from the large JSON configuration, this is not straightforward since all the strings data are located in a single array. You need to get the data indices from various fields in order to be able to split the data into columns and then build your dataframe

    The following code will prompt the user to choose a worksheet (by index), parse the data and put it in a dataframe :

    library(rvest)
    library(rjson)
    library(httr)
    library(stringr)
    
    #replace the hostname and the path if necessary
    host_url <- "https://public.tableau.com"
    path <- "/views/S07StuP58/Dashboard1"
    
    body <- read_html(modify_url(host_url, 
                                 path = path, 
                                 query = list(":embed" = "y",":showVizHome" = "no")
    ))
    
    data <- body %>% 
      html_nodes("textarea#tsConfigContainer") %>% 
      html_text()
    json <- fromJSON(data)
    
    url <- modify_url(host_url, path = paste(json$vizql_root, "/bootstrapSession/sessions/", json$sessionid, sep =""))
    
    resp <- POST(url, body = list(sheet_id = json$sheetId), encode = "form")
    data <- content(resp, "text")
    
    extract <- str_match(data, "\\d+;(\\{.*\\})\\d+;(\\{.*\\})")
    info <- fromJSON(extract[1,1])
    data <- fromJSON(extract[1,3])
    
    worksheets = names(data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap)
    
    for(i in 1:length(worksheets)){
      print(paste("[",i,"] ",worksheets[i], sep=""))
    }
    selected <-  readline(prompt="select worksheet by index: ");
    worksheet <- worksheets[as.integer(selected)]
    print(paste("you selected :", worksheet, sep=" "))
    
    columnsData <- data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap[[worksheet]]$presModelHolder$genVizDataPresModel$paneColumnsData
    
    i <- 1
    result <- list();
    for(t in columnsData$vizDataColumns){
      if (is.null(t[["fieldCaption"]]) == FALSE) {
        paneIndex <- t$paneIndices
        columnIndex <- t$columnIndices
        if (length(t$paneIndices) > 1){
          paneIndex <- t$paneIndices[1]
        }
        if (length(t$columnIndices) > 1){
          columnIndex <- t$columnIndices[1]
        }
        result[[i]] <- list(
          fieldCaption = t[["fieldCaption"]], 
          valueIndices = columnsData$paneColumnsList[[paneIndex + 1]]$vizPaneColumns[[columnIndex + 1]]$valueIndices,
          aliasIndices = columnsData$paneColumnsList[[paneIndex + 1]]$vizPaneColumns[[columnIndex + 1]]$aliasIndices, 
          dataType = t[["dataType"]],
          stringsAsFactors = FALSE
        )
        i <- i + 1
      }
    }
    dataFull = data$secondaryInfo$presModelMap$dataDictionary$presModelHolder$genDataDictionaryPresModel$dataSegments[["0"]]$dataColumns
    
    cstring <- list();
    for(t in dataFull) {
      if(t$dataType == "cstring"){
        cstring <- t
        break
      }
    }
    data_index <- 1
    name_index <- 1
    frameData <-  list()
    frameNames <- c()
    for(t in dataFull) {
      for(index in result) {
        if (t$dataType == index["dataType"]){
          if (length(index$valueIndices) > 0) {
            j <- 1
            vector <- character(length(index$valueIndices))
            for (it in index$valueIndices){
              vector[j] <- t$dataValues[it+1]
              j <- j + 1
            }
            frameData[[data_index]] <- vector
            frameNames[[name_index]] <- paste(index$fieldCaption, "value", sep="-")
            data_index <- data_index + 1
            name_index <- name_index + 1
          }
          if (length(index$aliasIndices) > 0) {
            j <- 1
            vector <- character(length(index$aliasIndices))
            for (it in index$aliasIndices){
              if (it >= 0){
                vector[j] <- t$dataValues[it+1]
              } else {
                vector[j] <- cstring$dataValues[abs(it)]
              }
              j <- j + 1
            }
            frameData[[data_index]] <- vector
            frameNames[[name_index]] <- paste(index$fieldCaption, "alias", sep="-")
            data_index <- data_index + 1
            name_index <- name_index + 1
          }
        }
      }
    }
    
    df <- NULL
    lengthList <- c()
    for(i in 1:length(frameNames)){
      lengthList[i] <- length(frameData[[i]])
    }
    max <- max(lengthList)
    for(i in 1:length(frameNames)){
      if (length(frameData[[i]]) < max){
        len <- length(frameData[[i]])
        frameData[[i]][(len+1):max]<-""
      }
      df[frameNames[[i]]] <- frameData[i]
    }
    options(width = 1200)
    df <- as.data.frame(df, stringsAsFactors = FALSE)
    print(df)
    

    which gives the following output :

                   X.Student.Aid.Program.Type..value               X.Student.Aid.Program..value        X..Measure.Names..alias X.Multiple.Values..alias
    1                            Grants/Scholarships                                   Subtotal        Graduate Amount Awarded             $XXXXXXXX
    2                            Grants/Scholarships       Other (External) Grants/Scholarships        Graduate Amount Awarded                 $XXXXXX
    3                            Grants/Scholarships Miscellaneous Tuition/Fee Payments/Waivers        Graduate Amount Awarded              $XXXXXX
    4                            Grants/Scholarships     Graduate Assistantship Tuition Waivers        Graduate Amount Awarded              $XXXXXX
    5                            Grants/Scholarships                      Athletic Scholarships        Graduate Amount Awarded                 $XXXXXX
    6                            Grants/Scholarships          Institutional Grants/Scholarships        Graduate Amount Awarded               $XXXXXX
    7                            Grants/Scholarships          State (Other) Grants/Scholarships        Graduate Amount Awarded                   $XXXXXX
    8                            Grants/Scholarships                   Zell Miller Scholarships        Graduate Amount Awarded               $XXXXXX
    9                            Grants/Scholarships                          HOPE Scholarships        Graduate Amount Awarded                 $XXXXXX
    10                           Grants/Scholarships        Federal (Other) Grants/Scholarships        Graduate Amount Awarded                         
    11                           Grants/Scholarships                Federal Supplemental Grants        Graduate Amount Awarded                         
    12                           Grants/Scholarships                        Federal Pell Grants        Graduate Amount Awarded                         
    13                           Grants/Scholarships                                   Subtotal      Graduate Number of Awards                    XXXXXX
    14                           Grants/Scholarships       Other (External) Grants/Scholarships      Graduate Number of Awards                      XXX
    

    The python equivalent of the code above would be :

    import requests
    from bs4 import BeautifulSoup
    import json
    import re
    import pandas as pd
    
    #replace the hostname and the path if necessary
    host_url = "https://public.tableau.com"
    path = "/views/S07StuP58/Dashboard1"
    
    url = f"{host_url}{path}"
    
    r = requests.get(
        url,
        params= {
            ":embed": "y",
            ":showVizHome": "no"
        }
    ) 
    soup = BeautifulSoup(r.text, "html.parser")
    
    tableauData = json.loads(soup.find("textarea",{"id": "tsConfigContainer"}).text)
    
    dataUrl = f'{host_url}{tableauData["vizql_root"]}/bootstrapSession/sessions/{tableauData["sessionid"]}'
    
    r = requests.post(dataUrl, data= {
        "sheet_id": tableauData["sheetId"],
    })
    
    dataReg = re.search('\d+;({.*})\d+;({.*})', r.text, re.MULTILINE)
    info = json.loads(dataReg.group(1))
    data = json.loads(dataReg.group(2))
    
    worksheets = list(data["secondaryInfo"]["presModelMap"]["vizData"]["presModelHolder"]["genPresModelMapPresModel"]["presModelMap"].keys())
    
    for idx, ws in enumerate(worksheets):
        print(f"[{idx}] {ws}")
    
    selected = input("select worksheet by index: ")
    worksheet = worksheets[int(selected)]
    print(f"you selected : {worksheet}")
    
    columnsData = data["secondaryInfo"]["presModelMap"]["vizData"]["presModelHolder"]["genPresModelMapPresModel"]["presModelMap"][worksheet]["presModelHolder"]["genVizDataPresModel"]["paneColumnsData"]
    result = [ 
        {
            "fieldCaption": t.get("fieldCaption", ""), 
            "valueIndices": columnsData["paneColumnsList"][t["paneIndices"][0]]["vizPaneColumns"][t["columnIndices"][0]]["valueIndices"],
            "aliasIndices": columnsData["paneColumnsList"][t["paneIndices"][0]]["vizPaneColumns"][t["columnIndices"][0]]["aliasIndices"],
            "dataType": t.get("dataType"),
            "paneIndices": t["paneIndices"][0],
            "columnIndices": t["columnIndices"][0]
        }
        for t in columnsData["vizDataColumns"]
        if t.get("fieldCaption")
    ]
    dataFull = data["secondaryInfo"]["presModelMap"]["dataDictionary"]["presModelHolder"]["genDataDictionaryPresModel"]["dataSegments"]["0"]["dataColumns"]
    
    def onAlias(it, value, cstring):
        return value[it] if (it >= 0) else cstring["dataValues"][abs(it)-1]
    
    frameData = {}
    cstring = [t for t in dataFull if t["dataType"] == "cstring"][0]
    for t in dataFull:
        for index in result:
            if (t["dataType"] == index["dataType"]):
                if len(index["valueIndices"]) > 0:
                    frameData[f'{index["fieldCaption"]}-value'] = [t["dataValues"][abs(it)] for it in index["valueIndices"]]
                if len(index["aliasIndices"]) > 0:
                    frameData[f'{index["fieldCaption"]}-alias'] = [onAlias(it, t["dataValues"], cstring) for it in index["aliasIndices"]]
    
    df = pd.DataFrame.from_dict(frameData, orient='index').fillna(0).T
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):
        print(df)
    

    Try this on repl.it

    checkout this repo for both script in python and R

    0 讨论(0)
提交回复
热议问题