elasticsearch group-by multiple fields

前端 未结 4 1671
孤街浪徒
孤街浪徒 2020-12-14 10:42

I am looking for the best way to group data in Elasticsearch. Elasticsearch doesn't support something like 'group by' in SQL.

Lets say I have 1k categories and mi

4条回答
  •  陌清茗
    陌清茗 (楼主)
    2020-12-14 11:23

    The aggregations API allows grouping by multiple fields, using sub-aggregations. Suppose you want to group by fields field1, field2 and field3:

    {
      "aggs": {
        "agg1": {
          "terms": {
            "field": "field1"
          },
          "aggs": {
            "agg2": {
              "terms": {
                "field": "field2"
              },
              "aggs": {
                "agg3": {
                  "terms": {
                    "field": "field3"
                  }
                }
              }          
            }
          }
        }
      }
    }
    

    Of course this can go on for as many fields as you'd like.

    Update:
    For completeness, here is how the output of the above query looks. Also below is python code for generating the aggregation query and flattening the result into a list of dictionaries.

    {
      "aggregations": {
        "agg1": {
          "buckets": [{
            "doc_count": ,
            "key": ,
            "agg2": {
              "buckets": [{
                "doc_count": ,
                "key": ,
                "agg3": {
                  "buckets": [{
                    "doc_count": ,
                    "key": 
                  },
                  {
                    "doc_count": ,
                    "key": 
                  }, ...
                  ]
                },
                {
                "doc_count": ,
                "key": ,
                "agg3": {
                  "buckets": [{
                    "doc_count": ,
                    "key": 
                  },
                  {
                    "doc_count": ,
                    "key": 
                  }, ...
                  ]
                }, ...
              ]
            },
            {
            "doc_count": ,
            "key": ,
            "agg2": {
              "buckets": [{
                "doc_count": ,
                "key": ,
                "agg3": {
                  "buckets": [{
                    "doc_count": ,
                    "key": 
                  },
                  {
                    "doc_count": ,
                    "key": 
                  }, ...
                  ]
                },
                {
                "doc_count": ,
                "key": ,
                "agg3": {
                  "buckets": [{
                    "doc_count": ,
                    "key": 
                  },
                  {
                    "doc_count": ,
                    "key": 
                  }, ...
                  ]
                }, ...
              ]
            }, ...
          ]
        }
      }
    }
    

    The following Python code performs the group-by given the list of fields. If you specify include_missing=True, it also includes combinations of values where some of the fields are missing (you don't need it if you have version 2.0 of Elasticsearch, thanks to the `missing` parameter that the terms aggregation gained in that release).

    def group_by(es, fields, include_missing):
        """Group documents by several fields using nested terms aggregations.

        Builds one ``terms`` aggregation per entry of ``fields``, each nested
        inside the previous one and named after its field, runs the search and
        flattens the response with ``get_docs_from_agg_result``.

        Args:
            es: Client object exposing ``search(body=...)`` whose result is
                indexable by ``'aggregations'``.
            fields: Non-empty sequence of field names, outermost group first.
            include_missing: When truthy, a ``missing`` aggregation named
                ``<field>_missing`` is added next to every ``terms``
                aggregation so documents lacking a field are grouped too.

        Returns:
            Whatever ``get_docs_from_agg_result`` returns for the response.
        """
        # NOTE(review): terms aggregations presumably return only a limited
        # number of buckets by default -- confirm whether a 'size' is needed.
        root_field = fields[0]
        terms_node = {'terms': {'field': root_field}}
        agg_spec = {root_field: terms_node}

        missing_node = None
        if include_missing:
            missing_node = {'missing': {'field': root_field}}
            agg_spec[root_field + '_missing'] = missing_node

        for name in fields[1:]:
            child_terms = {'terms': {'field': name}}

            if include_missing:
                child_missing = {'missing': {'field': name}}
                missing_key = name + '_missing'
                # Both the "has value" and the "missing" parent need the same
                # pair of children so every combination is represented.
                terms_node['aggs'] = {
                    name: child_terms,
                    missing_key: child_missing,
                }
                missing_node['aggs'] = {
                    name: child_terms,
                    missing_key: child_missing,
                }
                missing_node = child_missing
            else:
                terms_node['aggs'] = {name: child_terms}

            # Descend one level: the next field nests under this one.
            terms_node = child_terms

        response = es.search(body={'aggs': agg_spec})
        return get_docs_from_agg_result(
            response['aggregations'], fields, include_missing)
    
    
    def get_docs_from_agg_result(agg_result, fields, include_missing):
        """Flatten a nested aggregation result into a list of flat records.

        Args:
            agg_result: Mapping holding, under ``fields[0]``, a dict with a
                ``'buckets'`` list, and (when ``include_missing`` is truthy)
                under ``fields[0] + '_missing'`` a single bucket dict. Each
                bucket carries ``'doc_count'`` and, for every deeper level,
                the same structure keyed by the next field name.
            fields: Non-empty sequence of field names, outermost group first.
            include_missing: Whether the ``<field>_missing`` buckets should be
                read alongside the regular buckets.

        Returns:
            A list of dicts, one per combination of values with a positive
            leaf ``doc_count``. Each dict maps every field name to its bucket
            key (``None`` for a missing-value bucket) and ``'doc_count'`` to
            the innermost bucket's count.

        The input ``agg_result`` is not modified.
        """
        current_field = fields[0]
        # Copy the bucket list: appending the "missing" bucket in place would
        # mutate the caller's response and duplicate it on repeated calls.
        buckets = list(agg_result[current_field]['buckets'])
        if include_missing:
            buckets.append(agg_result[current_field + '_missing'])

        if len(fields) == 1:
            # Leaf level: emit one record per non-empty bucket. The missing
            # bucket has no 'key', hence .get() yielding None.
            return [
                {
                    current_field: bucket.get('key'),
                    'doc_count': bucket['doc_count'],
                }
                for bucket in buckets if bucket['doc_count'] > 0
            ]

        result = []
        for bucket in buckets:
            value = bucket.get('key')
            # Each bucket doubles as the aggregation result for deeper fields.
            for record in get_docs_from_agg_result(
                    bucket, fields[1:], include_missing):
                record[current_field] = value
                result.append(record)

        return result
    

提交回复
热议问题