Crosstab with a large or undefined number of categories

后端 未结 3 509
野的像风
野的像风 2020-12-05 21:54

My real problem has to do with recording which of a very large number of anti-virus products agree that a given sample is a member of a given anti-virus family. The databas

3条回答
  •  臣服心动
    2020-12-05 22:29

    Using the same sample data as Clodoaldo ("create table vote...") and using the plpythonu function make_pivot_table (below), you can run:

    create temp table pivot_data on commit drop as 
        select * from vote where decision = 'Cat' union select photo, null, null from vote;
    
    select * from make_pivot_table('{photo}', 'voter',  'decision', 'count', 'pivot_data',
      'pivot_result', false);
    
    select * from pivot_result order by photo;
    

    The make_pivot_table function definition is:

    -- make_pivot_table
    -- python version 0.9
    -- last edited 2015-08-11 
    
    create or replace function
     make_pivot_table(row_headers text[], category_field text, value_field text,
      value_action text, input_table text, output_table text, keep_result boolean)
    returns void as
    $$
    # imports
    from collections import defaultdict
    import operator
    import string
    
    # constants
    BATCH_SIZE = 100
    VALID_ACTIONS = ('count', 'sum', 'min', 'max')
    NULL_CATEGORY_NAME = 'NULL_CATEGORY'
    TOTAL_COL = 'total'
    
    # functions
    def table_exists(tablename):
        """Return True when a table called *tablename* is listed in the catalog.

        The lookup goes through information_schema.Tables and ignores tables
        that live in the 'information_schema' and 'pg_catalog' schemas.  The
        name is passed as a bound parameter, so it needs no quoting here.
        """
        plan = plpy.prepare("""select table_schema, table_name from
            information_schema.Tables where table_schema not in ('information_schema',
            'pg_catalog') and table_name = $1""", ["text"])
        # Look up the name we were given (the original ignored the parameter and
        # always checked the outer input_table).  A row limit of 2 is plenty: we
        # only care whether anything matched.
        rows = plpy.execute(plan, [tablename], 2)
        return bool(rows)
    
    def make_rowkey(row):
        """Build the hashable key that identifies the summary row for *row*.

        The key is a tuple holding the value of each column named in
        row_headers (an argument of the enclosing SQL function), in that order.
        """
        return tuple(row[header] for header in row_headers)
    
    def quote_if_needed(value):
        """Return *value* as text ready to be spliced into an SQL statement.

        Strings are escaped with plpy.quote_literal; every other value is
        simply converted with str().
        """
        if isinstance(value, basestring):
            return plpy.quote_literal(value)
        return str(value)
    
    # assumes None is never a value in the dct
    def update_if(dct, key, new_value, op, result=True):
        """Store *new_value* under *key* when it beats the value already there.

        dct       -- dictionary updated in place
        key       -- entry to create or replace
        new_value -- candidate value
        op        -- two-argument comparison, called as op(new_value, current)
        result    -- outcome of op that means "replace" (default True)

        A missing entry is always filled in.  Because "missing" is detected by
        dct.get() returning None, None must never be stored as a real value.
        """
        current_value = dct.get(key)
        # Compare the candidate that was passed in, not whatever variable named
        # "value" happens to exist in the enclosing scope.
        if current_value is None or op(new_value, current_value) == result:
            dct[key] = new_value
    
    def update_output_table(output_table, row_headers, colname, value):
        """Set one cell of the pivot table.

        output_table -- name of the table to update
        row_headers  -- names of the columns that identify a summary row
        colname      -- column to assign
        value        -- value to store; strings are escaped as SQL literals,
                        anything else is interpolated as-is

        NOTE: the row to update is taken from the variable ``rowkey`` of the
        enclosing scope (the current loop variable at every call site), not
        from a parameter.  rowkey[i] is matched against row_headers[i].
        """
        pg_value = plpy.quote_literal(value) if isinstance(value, basestring) else value
        conditions = [
            '%s = %s' % (plpy.quote_ident(row_header), quote_if_needed(rowkey[index]))
            for index, row_header in enumerate(row_headers)
        ]
        # Quote the table name exactly like the create/insert statements do, so
        # that a name needing quotes refers to the same table everywhere.
        sql = 'update %s set %s = %s where %s' % (
            plpy.quote_ident(output_table), plpy.quote_ident(colname), pg_value,
            ' and '.join(conditions))
        plpy.execute(sql)
    
    
    # -----------------
    
    # Refuse to run without a source table or with an unsupported aggregate.
    # plpy.error raises, so nothing below executes after a failed check.
    if not table_exists(input_table):
        plpy.error('input_table %s does not exist' % input_table)

    if value_action not in VALID_ACTIONS:
        plpy.error('%s is not a recognised action' % value_action)
    
    # load the data into a dict
    # Accumulators are keyed by (rowkey, category); total_dict is keyed by
    # rowkey alone and is only meaningful for the additive actions.
    count_dict = defaultdict(int)
    sum_dict = defaultdict(float)
    total_dict = defaultdict(float)
    min_dict = dict()
    max_dict = dict()
    categories_seen = set()
    rowkeys_seen = set()
    do_total = value_action in ('count', 'sum')

    # Read the input through a cursor in batches of BATCH_SIZE rows.
    cursor = plpy.cursor('select * from %s' % plpy.quote_ident(input_table))
    while True:
        rows = cursor.fetch(BATCH_SIZE)
        if not rows:
            break
        for row in rows:
            # Every row registers its summary row, even when its value is null,
            # so that the row still appears in the output.
            rowkey = make_rowkey(row)
            rowkeys_seen.add(rowkey)
            category = row[category_field]
            value = row[value_field]
            dctkey = (rowkey, category)

            # skip if value field is null
            if value is None:
                continue

            categories_seen.add(category)

            # value_action was validated earlier, so exactly one branch applies.
            if value_action == 'count':
                count_dict[dctkey] += 1
                total_dict[rowkey] += 1
            elif value_action == 'sum':
                sum_dict[dctkey] += value
                total_dict[rowkey] += value
            elif value_action == 'min':
                update_if(min_dict, dctkey, value, operator.lt)
            elif value_action == 'max':
                update_if(max_dict, dctkey, value, operator.gt)

    plpy.notice('seen %s summary rows and %s categories' % (len(rowkeys_seen),
                                                            len(categories_seen)))
    
    # Map every input column name to its PostgreSQL type name.  A query that
    # returns no rows still exposes the column names and type OIDs.
    coltype_dict = dict()
    probe = plpy.execute(
        'select * from %s where false' % plpy.quote_ident(input_table))
    column_oids = probe.coltypes()
    for colname, type_oid in zip(probe.colnames(), column_oids):
        lookup_sql = 'select typname from pg_type where oid = %s' % type_oid
        typname = list(plpy.cursor(lookup_sql))[0]['typname']
        plpy.notice('%s: %s' % (colname, typname))
        coltype_dict[colname] = typname
    
    # (Re)create the output table: the row header columns first, then one
    # column per category seen, then the optional total column.
    plpy.execute('drop table if exists %s' % plpy.quote_ident(output_table))

    cols = []
    for header in row_headers:
        cols.append('%s %s' % (plpy.quote_ident(header), coltype_dict[header]))

    # Counts are stored as bigint; every other action keeps the value's type.
    if value_action == 'count':
        cat_type = 'bigint'
    else:
        cat_type = coltype_dict[value_field]

    for cat in sorted(categories_seen):
        # A null category gets a placeholder column name.
        cat_name = NULL_CATEGORY_NAME if cat is None else cat
        cols.append('%s %s' % (plpy.quote_ident(cat_name), cat_type))

    if do_total:
        cols.append('%s %s' % (TOTAL_COL, cat_type))

    # A result that is not kept lives in a temp table dropped at commit.
    if keep_result:
        opening, closing = 'create table %s (', ')'
    else:
        opening, closing = 'create temp table %s (', ') on commit drop'

    create_sql = '\n'.join([opening % plpy.quote_ident(output_table),
                            ',\n'.join(cols),
                            closing])
    plpy.execute(create_sql)
    
    # Pick the accumulator that matches the requested action.
    value_dict = {'count': count_dict, 'sum': sum_dict,
                  'min': min_dict, 'max': max_dict}[value_action]

    # One output row per summary key; only the row header columns are filled in.
    insert_prefix = 'insert into %s values (' % plpy.quote_ident(output_table)
    for rowkey in rowkeys_seen:
        literals = [quote_if_needed(part) for part in rowkey]
        plpy.execute(insert_prefix + ', '.join(literals) + ')')

    # NOTE: the loops below must bind the name ``rowkey``, because
    # update_output_table reads it from this scope to find the row to update.
    if do_total:
        for rowkey, value in total_dict.iteritems():
            update_output_table(output_table, row_headers, TOTAL_COL, value)

    for cell_key, value in value_dict.iteritems():
        rowkey, category = cell_key
        # put in category value
        if category is None:
            colname = NULL_CATEGORY_NAME
        else:
            colname = category
        update_output_table(output_table, row_headers, colname, value)
    
    $$ language plpythonu
    

提交回复
热议问题