Question
When running a relatively simple UDF on a dataset in Google's BigQuery, I have run into the following error:
Resources exceeded during query execution: UDF out of memory.
Running the UDF on a single row, or even 3 rows, the function works fine, and outputs ca. 150,000 rows per UDF run to a separate table. An example which ran on 2 rows and produced output has job-ID broad-cga-het:bquijob_48bd79dc_155ffe83f48. However, when running the UDF on 5+ rows, the function fails. An example for which the UDF fails in this case has job-ID broad-cga-het:bquijob_1248b517_155ffeb2a10. All of the rows in the input table are identical, so the issue cannot be an abnormally large input row size compared to the rows I have tested. I am running the UDF on a data set with 10,000 rows of total size about 300 MB, so ca. 30 KB per row.
This problem was addressed in the following post,
BigQuery UDF memory exceeded error on multiple rows but works fine on single row
however, I tried implementing some of the possible solutions posted there (I ran a GROUP EACH BY on all variables based on row number in the SELECT input query for the UDF, but this did not help), and I was unable to find a clear solution to this issue elsewhere.
I thought that garbage collection may not be working properly within the environment the JavaScript is running in, but I am unsure how to check this (do BigQuery UDFs create a log file anywhere?)
Here is the BigQuery UDF I am running:
// Register the `permute` UDF with BigQuery (legacy-SQL JavaScript UDF API).
// Arguments: UDF name, input column names, output schema, implementation.
bigquery.defineFunction(
'permute',
['obj_nums','num_obj_per_indiv','row_number'], // Names of input columns
[{name: 'obj_pair', type: 'string'}, {name: 'perm_run_id', type: 'integer'}], // Output schema
permute // Implementation function defined below; called once per input row
);
/**
 * BigQuery UDF: shuffles the row's object ids, partitions them into
 * consecutive groups whose sizes are given by `num_obj_per_indiv`, and
 * emits every unordered pair of distinct objects within each group.
 *
 * @param {Object} row - input row with:
 *   - obj_nums: comma-separated integer object ids
 *   - num_obj_per_indiv: comma-separated group sizes (objects per individual)
 *   - row_number: identifier copied into each emitted row's perm_run_id
 * @param {Function} emit - BigQuery emitter; called once per pair with
 *   {obj_pair: "<a>_<b>", perm_run_id: row_number}
 */
function permute(row, emit) {
  // Parse the comma-separated input columns into integer arrays.
  var obj_ids = row['obj_nums'].split(",").map(function (x) {
    return parseInt(x, 10);
  });
  var num_obj_per_indiv = row['num_obj_per_indiv'].split(",").map(function (x) {
    return parseInt(x, 10);
  });
  var row_number = row['row_number'];

  // Randomly shuffle objs using the Durstenfeld shuffle algorithm.
  obj_ids = shuffle_objs(obj_ids);

  // Use a fixed number of objs per indiv, drawing from the shuffled pool.
  // FIX: use an index loop instead of for...in (which yields string keys
  // and enumerates any inherited properties); removed two unused locals.
  for (var index = 0; index < num_obj_per_indiv.length; index++) {
    var obj_count = num_obj_per_indiv[index];
    var perm_run_objs = [];
    for (var j = 0; j < obj_count; j++) {
      perm_run_objs.push(obj_ids.pop());
    }
    // Deduplicate the draw before pairing.
    perm_run_objs = Array.from(new Set(perm_run_objs));

    // Emit every unordered pair in this group.
    // FIX: `current_obj` was assigned without `var`, creating an implicit
    // global that leaks state across rows within one UDF invocation.
    while (perm_run_objs.length > 1) {
      var current_obj = perm_run_objs.pop();
      for (var k = 0; k < perm_run_objs.length; k++) {
        var pair_obj = perm_run_objs[k];
        // NOTE: default .sort() compares numbers lexicographically; kept
        // as-is to preserve the original pair-key format exactly.
        emit({"obj_pair": [current_obj, pair_obj].sort().join("_"),
              "perm_run_id": row_number});
      }
    }
  }
}
/**
* Randomize array element order in-place.
* Using Durstenfeld shuffle algorithm.
*/
/**
 * Randomize array element order in place using the Durstenfeld variant
 * of the Fisher-Yates shuffle (uniform random permutation).
 *
 * @param {Array} obj_array - array to shuffle; mutated in place.
 * @returns {Array} the same array instance, now shuffled.
 */
function shuffle_objs(obj_array) {
  var n = obj_array.length;
  // Walk backwards, swapping each slot with a uniformly chosen slot at
  // or before it; positions above i are already final.
  for (var i = n - 1; i > 0; i--) {
    var j = Math.floor(Math.random() * (i + 1));
    var held = obj_array[i];
    obj_array[i] = obj_array[j];
    obj_array[j] = held;
  }
  return obj_array;
}
Any help on this issue would be tremendous! Thank you.
来源:https://stackoverflow.com/questions/38414883/bigquery-udf-out-of-memory-issues