aggregate QUERY - 3 month average per person for every month

我只是一个虾纸丫 提交于 2019-12-11 07:28:15

问题


Suppose I have a view over the following data in BigQuery:

Person | Amount | yearMonth
---------------------------
AA     |   100  |   201701
AA     |   200  |   201702
AA     |   300  |   201703
AA     |   70   |   201704
AB     |   10   |   201701
AB     |   50   |   201702
AB     |   60   |   201703
AB     |   70   |   201704
AC     |   70   |   201701
AC     |   80   |   201702
AC     |   30   |   201703
AC     |   10   |   201704

Now, for every person and every month, I need the average amount over the last 3 months (the current month and the two before it).

Expected Results:

Person | Amount | yearMonth
---------------------------
AA     |   200  |   201703(avg of 201701-201703)
AA     |   190  |   201704(avg of 201702-201704)
AB     |   40   |   201703(avg of 201701-201703)
AB     |   60   |   201704(avg of 201702-201704)
AC     |   60   |   201703(avg of 201701-201703)
AC     |   40   |   201704(avg of 201702-201704)

How is this calculated?

First Row

  • AA = 200, comes from (100 (201701) + 200 (201702) + 300 (201703)) / 3 = 200
  • AA = 190, comes from (200 (201702) + 300 (201703) + 70 (201704)) / 3 = 190
  • AB = 40, comes from (10 (201701) + 50 (201702) + 60 (201703)) / 3 = 40
  • and so on

I am not really sure how to group by this. I don't mind if your answer has a link to this issue.

Thanks heaps

Is it also possible in legacy SQL? I haven't migrated to Standard SQL yet; my view is in legacy SQL.


回答1:


Below is for BigQuery Standard SQL (at least should give you an idea on the logic of proper grouping)

#standardSQL
-- Rolling 3-month average of amount per person.
-- A row is returned only when its window holds exactly three rows, i.e. the
-- current month and both preceding months are present for that person.
WITH dated AS (
  -- Convert the YYYYMM integer into the day number (days since 1970-01-01)
  -- of the first day of that month, so it can order a numeric RANGE frame.
  SELECT
    person,
    amount,
    yearMonth,
    UNIX_DATE(DATE(DIV(yearMonth, 100), MOD(yearMonth, 100), 1)) AS dt
  FROM `project.dataset.table`
),
rolling AS (
  SELECT
    person,
    yearMonth,
    AVG(amount) OVER last_3_months AS avg_amount,
    COUNT(1) OVER last_3_months AS months
  FROM dated
  -- 63 days back from a month start always reaches the two previous month
  -- starts (at most 62 days away) and never the third one (at least 89 days).
  WINDOW last_3_months AS (
    PARTITION BY person
    ORDER BY dt
    RANGE BETWEEN 63 PRECEDING AND CURRENT ROW
  )
)
SELECT
  person,
  yearMonth,
  CAST(avg_amount AS INT64) AS amount
FROM rolling
WHERE months = 3
-- Optional: uncomment for a deterministic output order.
-- ORDER BY person, yearMonth

You can test / play with it with dummy data as below

#standardSQL
-- Self-contained demo of the rolling 3-month average: the first CTE stands in
-- for the real table so the query can be run as-is.
WITH `project.dataset.table` AS (
  SELECT 'AA' AS person, 100 AS amount, 201701 AS yearMonth UNION ALL
  SELECT 'AA', 200, 201702 UNION ALL
  SELECT 'AA', 300, 201703 UNION ALL
  SELECT 'AA', 70, 201704 UNION ALL
  SELECT 'AB', 10, 201701 UNION ALL
  SELECT 'AB', 50, 201702 UNION ALL
  SELECT 'AB', 60, 201703 UNION ALL
  SELECT 'AB', 70, 201704 UNION ALL
  SELECT 'AC', 70, 201701 UNION ALL
  SELECT 'AC', 80, 201702 UNION ALL
  SELECT 'AC', 30, 201703 UNION ALL
  SELECT 'AC', 10, 201704
),
dated AS (
  -- Convert the YYYYMM integer into the day number (days since 1970-01-01)
  -- of the first day of that month, so it can order a numeric RANGE frame.
  SELECT
    person,
    amount,
    yearMonth,
    UNIX_DATE(DATE(DIV(yearMonth, 100), MOD(yearMonth, 100), 1)) AS dt
  FROM `project.dataset.table`
),
rolling AS (
  SELECT
    person,
    yearMonth,
    AVG(amount) OVER last_3_months AS avg_amount,
    COUNT(1) OVER last_3_months AS months
  FROM dated
  -- 63 days back from a month start always reaches the two previous month
  -- starts (at most 62 days away) and never the third one (at least 89 days).
  WINDOW last_3_months AS (
    PARTITION BY person
    ORDER BY dt
    RANGE BETWEEN 63 PRECEDING AND CURRENT ROW
  )
)
SELECT
  person,
  yearMonth,
  CAST(avg_amount AS INT64) AS amount
FROM rolling
-- Keep only months whose window contains all three monthly rows.
WHERE months = 3
ORDER BY person, yearMonth

Output is as expected

person  yearMonth   amount   
AA      201703      200  
AA      201704      190  
AB      201703      40   
AB      201704      60   
AC      201703      60   
AC      201704      40    

Added version for BigQuery Legacy SQL

#legacySQL
-- Rolling 3-month average of amount per person (BigQuery Legacy SQL).
-- A row is returned only when its window holds exactly three rows, i.e. the
-- current month and both preceding months are present for that person.
SELECT
  person, yearMonth, INTEGER(amount) amount
FROM (
  SELECT
    person, yearMonth, dt,
    -- Window frame: everything within 63 days (expressed in seconds, since dt
    -- is in seconds) before the current row's month start. That span reaches
    -- the two previous month starts (at most 62 days back) but not the third
    -- (at least 89 days back).
    AVG(amount) OVER(PARTITION BY person ORDER BY dt range BETWEEN 63*60*60*24 preceding AND current row) amount,
    -- Number of rows in the same frame; used below to require a full window.
    COUNT(1) OVER(PARTITION BY person ORDER BY dt range BETWEEN 63*60*60*24 preceding AND current row) months
  FROM (
    SELECT 
      person, amount, yearMonth, 
      -- Build the string 'YYYY-MM-01' from the YYYYMM integer and convert it to
      -- seconds since the Unix epoch. Adding 100 to the month and dropping the
      -- leading character zero-pads it to two digits.
      TIMESTAMP_TO_SEC(TIMESTAMP(CONCAT(STRING(INTEGER(yearMonth/100)), '-', SUBSTR(STRING(100 + yearMonth % 100), 2, 2), '-01'))) AS dt
    FROM [project:dataset.table]
  )
)
WHERE months = 3
-- Optional: uncomment for a deterministic output order.
-- ORDER BY person, yearMonth

You can test / play with it using below example with dummy data

#legacySQL
-- Self-contained demo of the rolling 3-month average in BigQuery Legacy SQL:
-- the innermost FROM lists inline one-row subselects instead of the real table.
SELECT
  person, yearMonth, INTEGER(amount) amount
FROM (
  SELECT
    person, yearMonth, dt,
    -- Window frame: everything within 63 days (expressed in seconds, since dt
    -- is in seconds) before the current row's month start. That span reaches
    -- the two previous month starts (at most 62 days back) but not the third
    -- (at least 89 days back).
    AVG(amount) OVER(PARTITION BY person ORDER BY dt range BETWEEN 63*60*60*24 preceding AND current row) amount,
    -- Number of rows in the same frame; used below to require a full window.
    COUNT(1) OVER(PARTITION BY person ORDER BY dt range BETWEEN 63*60*60*24 preceding AND current row) months
  FROM (
    SELECT 
      person, amount, yearMonth, 
      -- Build the string 'YYYY-MM-01' from the YYYYMM integer and convert it to
      -- seconds since the Unix epoch. Adding 100 to the month and dropping the
      -- leading character zero-pads it to two digits.
      TIMESTAMP_TO_SEC(TIMESTAMP(CONCAT(STRING(INTEGER(yearMonth/100)), '-', SUBSTR(STRING(100 + yearMonth % 100), 2, 2), '-01'))) AS dt
    FROM -- [project:dataset.table]
      -- In Legacy SQL a comma-separated list of tables/subselects in FROM is
      -- combined as UNION ALL, so these twelve subselects form the test rows.
      (SELECT 'AA' person, 100 amount, 201701 yearMonth),
      (SELECT 'AA' person, 200 amount, 201702 yearMonth),
      (SELECT 'AA' person, 300 amount, 201703 yearMonth),
      (SELECT 'AA' person, 70 amount, 201704 yearMonth),
      (SELECT 'AB' person, 10 amount, 201701 yearMonth),
      (SELECT 'AB' person, 50 amount, 201702 yearMonth),
      (SELECT 'AB' person, 60 amount, 201703 yearMonth),
      (SELECT 'AB' person, 70 amount, 201704 yearMonth),
      (SELECT 'AC' person, 70 amount, 201701 yearMonth),
      (SELECT 'AC' person, 80 amount, 201702 yearMonth),
      (SELECT 'AC' person, 30 amount, 201703 yearMonth),
      (SELECT 'AC' person, 10 amount, 201704 yearMonth)
  )
)
-- Keep only months whose window contains all three monthly rows.
WHERE months = 3
ORDER BY person, yearMonth


来源:https://stackoverflow.com/questions/47047859/aggregate-query-3-month-average-per-person-for-every-month

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!