How to extract all the keys in a JSON object with BigQuery

前端 未结 4 1879
甜味超标
甜味超标 2020-12-30 10:24

BigQuery has facilities to parse JSON in real-time interactive queries: Just store the JSON encoded object as a string, and query in real time, with functions like JSON_EXTR

相关标签:
4条回答
  • 2020-12-30 10:59

    This is what I came up with (Specifically for StandardSQL).. Not sure if accumulating in a list is the best method... Also.. I simplified for my case where I'm just concerned with keys.

    CREATE TEMPORARY FUNCTION Foo(infoo STRING)
    RETURNS Array<String>
    LANGUAGE js AS """
          blah = [];
    
    
          function processKey(node, parent) {
            if (parent !== '') {parent += '.'};
            Object.keys(node).forEach(function(key) {
              value = node[key].toString();
              if (value !== '[object Object]') {
                blah.push(parent+key) 
              } else {
                processKey(node[key], parent + key);
              };
            });                 
          };
    
        try {     
          x = JSON.parse(infoo);  
          processKey(x,'');
          return blah;
        } catch (e) { return null }      
    
    """
    OPTIONS ();
    WITH x as(
    select Foo(jsonfield) as bbb from clickstream.clikcs
    )
    select distinct arr_item from (SELECT arr_item FROM x, UNNEST(bbb) as arr_item) 
    
    0 讨论(0)
  • 2020-12-30 11:01

    Here's something that uses Standard SQL:

    CREATE TEMP FUNCTION jsonObjectKeys(input STRING)
    RETURNS Array<String>
    LANGUAGE js AS """
      return Object.keys(JSON.parse(input));
    """;
    WITH keys AS (
      SELECT
        jsonObjectKeys(myColumn) AS keys
      FROM
        myProject.myTable
      WHERE myColumn IS NOT NULL
    )
    SELECT
      DISTINCT k
    FROM keys
    CROSS JOIN UNNEST(keys.keys) AS k
    ORDER BY k
    
    0 讨论(0)
  • 2020-12-30 11:08

    How to extract all of a JSON object keys using a JavaScript UDF in BigQuery:

    SELECT type, key
    FROM (
      SELECT * FROM
      js(
        (SELECT json, type FROM [fh-bigquery:openlibrary.ol_dump_20151231]
        ),
        // Input columns.
        json, type,
        // Output schema.
        "[{name: 'key', type:'string'},
         {name: 'type', type:'string'}]",
         // The function.
         "function(r, emit) { 
          x=JSON.parse(r.json)
          Object.keys(x).forEach(function(entry) {
            emit({key:entry, type:r.type,});
          });     
        }"
      )
    )
    LIMIT 100
    

    Grouped and counted:

    Once you've found all the keys you can use, then you can use JSON_EXTRACT_SCALAR on a normal SQL query:

    Now that you know the keys, you can extract all information known for a type:

    SELECT JSON_EXTRACT_SCALAR(json, '$.key') key,
      JSON_EXTRACT_SCALAR(json, '$.type.key') type,
      JSON_EXTRACT(json, '$.revision') revision,
      JSON_EXTRACT_SCALAR(json, '$.last_modified.value') last_modified,
      JSON_EXTRACT_SCALAR(json, '$.title') title,
      JSON_EXTRACT_SCALAR(json, '$.publish_date') publish_date,
      JSON_EXTRACT(json, '$.publishers') publishers,
      JSON_EXTRACT(json, '$.latest_revision') latest_revision,
      JSON_EXTRACT(json, '$.languages') languages,
      JSON_EXTRACT(json, '$.authors') authors,
      JSON_EXTRACT(json, '$.works') works,
      JSON_EXTRACT(json, '$.number_of_pages') number_of_pages,
      JSON_EXTRACT(json, '$.publish_places') publish_places,
      JSON_EXTRACT(json, '$.publish_country') publish_country,
      JSON_EXTRACT(json, '$.subjects') subjects,
      JSON_EXTRACT_SCALAR(json, '$.created.value') created,
      JSON_EXTRACT_SCALAR(json, '$.pagination') pagination,
      JSON_EXTRACT_SCALAR(json, '$.by_statement') by_statement,
      JSON_EXTRACT(json, '$.isbn_10') isbn_10,
      JSON_EXTRACT_SCALAR(json, '$.isbn_10[0]') isbn_10_0,
      JSON_EXTRACT(json, '$.notes') notes,
      JSON_EXTRACT(json, '$.lc_classifications') lc_classifications,
      JSON_EXTRACT_SCALAR(json, '$.subtitle') subtitle,
      JSON_EXTRACT(json, '$.lccn') lccn,
      JSON_EXTRACT(json, '$.identifiers') identifiers,
      JSON_EXTRACT(json, '$.contributions') contributions,
      JSON_EXTRACT(json, '$.isbn_13') isbn_13,
      JSON_EXTRACT_SCALAR(json, '$.isbn_13[0]') isbn_13_0,
      JSON_EXTRACT(json, '$.physical_format') physical_format,
      JSON_EXTRACT(json, '$.oclc_numbers') oclc_numbers,
      JSON_EXTRACT(json, '$.series') series,
      JSON_EXTRACT(json, '$.source_records') source_records,
      JSON_EXTRACT(json, '$.covers') covers,
      JSON_EXTRACT(json, '$.dewey_decimal_class') dewey_decimal_class,
      JSON_EXTRACT_SCALAR(json, '$.edition_name') edition_name,
      # ...
    FROM [fh-bigquery:openlibrary.ol_dump_20151231]
    WHERE type='/type/edition'
    LIMIT 10
    

    (sample data taken from an Open Library data dump https://openlibrary.org/developers/dumps, based on a reddit conversation)

    0 讨论(0)
  • 2020-12-30 11:14

    Below version fixes some "issues" in original answer like:
    1. only first level of keys was emitted
    2. having to manually comppile and than run final query for extracting info based on discovered keys

    SELECT type, key, value, COUNT(1) AS weight 
    FROM JS(
      (SELECT json, type 
         FROM [fh-bigquery:openlibrary.ol_dump_20151231@0] 
         WHERE type = '/type/edition'
      ),
      json, type,                             // Input columns
      "[{name: 'type', type:'string'},        // Output schema
       {name: 'key', type:'string'},
       {name: 'value', type:'string'}]",
       "function(r, emit) {                    // The function
          x = JSON.parse(r.json);
          processKey(x, '');
          function processKey(node, parent) {
            if (parent !== '') {parent += '.'};
            Object.keys(node).map(function(key) {
              value = node[key].toString();
              if (value !== '[object Object]') {
                emit({type:r.type, key:parent + key, value:value});
              } else {
                processKey(node[key], parent + key);
              };
            });         
          };
        }"
      )
    GROUP EACH BY type, key, value
    ORDER BY weight DESC
    LIMIT 1000
    

    The result is as below

    Row          type   key                 value                         weight     
    1   /type/edition   type.key            /type/edition               25140209     
    2   /type/edition   last_modified.type  /type/datetime              25140209     
    3   /type/edition   created.type        /type/datetime              17092292     
    4   /type/edition   languages.0.key     /languages/eng              14514830     
    5   /type/edition   notes.type          /type/text                  11681480     
    6   /type/edition   revision            2                            8714084     
    7   /type/edition   latest_revision     2                            8704217     
    8   /type/edition   revision            3                            5041680     
    9   /type/edition   latest_revision     3                            5040634     
    10  /type/edition   created.value       2008-04-01T03:28:50.625462   3579095     
    11  /type/edition   revision            1                            3396868     
    12  /type/edition   physical_format     Paperback                    3181270     
    13  /type/edition   revision            4                            3053266     
    14  /type/edition   latest_revision     4                            3053197     
    15  /type/edition   revision            5                            2076094     
    16  /type/edition   latest_revision     5                            2076072     
    17  /type/edition   publish_country     nyu                          1727347     
    18  /type/edition   created.value       2008-04-30T09:38:13.731961   1681227     
    19  /type/edition   publish_country     enk                          1627969     
    20  /type/edition   publish_places      London                       1613755     
    21  /type/edition   physical_format     Hardcover                    1495864     
    22  /type/edition   publish_places      New York                     1467779     
    23  /type/edition   revision            6                            1437467     
    24  /type/edition   latest_revision     6                            1437463     
    25  /type/edition   publish_country     xxk                          1407624 
    
    0 讨论(0)
提交回复
热议问题