问题
Suppose I have a view of data in BigQuery:
Person | Amount | yearMonth
---------------------------
AA | 100 | 201701
AA | 200 | 201702
AA | 300 | 201703
AA | 70 | 201704
AB | 10 | 201701
AB | 50 | 201702
AB | 60 | 201703
AB | 70 | 201704
AC | 70 | 201701
AC | 80 | 201702
AC | 30 | 201703
AC | 10 | 201704
Now I need the average amount over the last 3 months (the current month and the two before it), for every person and every month.
Expected Results:
Person | Amount | yearMonth
---------------------------
AA | 200 | 201703(avg of 201701-201703)
AA | 190 | 201704(avg of 201702-201704)
AB | 40 | 201703(avg of 201701-201703)
AB | 60 | 201704(avg of 201702-201704)
AC | 60 | 201703(avg of 201701-201703)
AC | 40 | 201704(avg of 201702-201704)
How is this calculated?
First Row
- AA = 200, comes from (100 (201701) + 200 (201702) + 300 (201703)) / 3 = 200
- AA = 190, comes from (200 (201702) + 300 (201703) + 70 (201704)) / 3 = 190
- AB = 40, comes from (10 (201701) + 50 (201702) + 60 (201703)) / 3 = 40
- and so on
I am not really sure how to group by this. I don't mind if your answer has a link to this issue.
Thanks heaps
Is it also possible in legacy SQL? I haven't migrated to Standard SQL yet; my view is in legacy SQL.
回答1:
Below is for BigQuery Standard SQL (at least should give you an idea on the logic of proper grouping)
#standardSQL
-- Trailing 3-month average of amount per person, reported for every month
-- that has the full three months (itself plus the two before it) present.
WITH monthly AS (
  SELECT
    person,
    amount,
    yearMonth,
    -- Consecutive-month ordinal: adjacent calendar months differ by exactly 1,
    -- including across a year boundary (201612 -> 201701).
    -- Assumes yearMonth is an integer of the form YYYYMM with MM in 1..12.
    DIV(yearMonth, 100) * 12 + MOD(yearMonth, 100) AS month_index
  FROM `project.dataset.table`
),
rolling AS (
  SELECT
    person,
    yearMonth,
    AVG(amount) OVER trailing_3_months AS avg_amount,
    -- Number of rows that actually fell inside the 3-month window.
    COUNT(1) OVER trailing_3_months AS months
  FROM monthly
  WINDOW trailing_3_months AS (
    PARTITION BY person
    ORDER BY month_index
    -- RANGE (not ROWS) so a missing month shrinks the window instead of
    -- silently pulling in an older month.
    RANGE BETWEEN 2 PRECEDING AND CURRENT ROW
  )
)
SELECT
  person,
  yearMonth,
  CAST(avg_amount AS INT64) AS amount
FROM rolling
-- Keep only months whose window holds three rows (one per month when each
-- person has a single row per month).
WHERE months = 3
-- Append "ORDER BY person, yearMonth" when a deterministic row order is needed.
You can test / play with it with dummy data as below
#standardSQL
-- Self-contained demo: the first CTE stands in for the real table so the
-- query can be run as-is.
WITH `project.dataset.table` AS (
  SELECT 'AA' person, 100 amount, 201701 yearMonth UNION ALL
  SELECT 'AA', 200, 201702 UNION ALL
  SELECT 'AA', 300, 201703 UNION ALL
  SELECT 'AA', 70, 201704 UNION ALL
  SELECT 'AB', 10, 201701 UNION ALL
  SELECT 'AB', 50, 201702 UNION ALL
  SELECT 'AB', 60, 201703 UNION ALL
  SELECT 'AB', 70, 201704 UNION ALL
  SELECT 'AC', 70, 201701 UNION ALL
  SELECT 'AC', 80, 201702 UNION ALL
  SELECT 'AC', 30, 201703 UNION ALL
  SELECT 'AC', 10, 201704
),
monthly AS (
  SELECT
    person,
    amount,
    yearMonth,
    -- Consecutive-month ordinal: adjacent calendar months differ by exactly 1,
    -- including across a year boundary (201612 -> 201701).
    -- Assumes yearMonth is an integer of the form YYYYMM with MM in 1..12.
    DIV(yearMonth, 100) * 12 + MOD(yearMonth, 100) AS month_index
  FROM `project.dataset.table`
),
rolling AS (
  SELECT
    person,
    yearMonth,
    AVG(amount) OVER trailing_3_months AS avg_amount,
    -- Number of rows that actually fell inside the 3-month window.
    COUNT(1) OVER trailing_3_months AS months
  FROM monthly
  WINDOW trailing_3_months AS (
    PARTITION BY person
    ORDER BY month_index
    -- RANGE (not ROWS) so a missing month shrinks the window instead of
    -- silently pulling in an older month.
    RANGE BETWEEN 2 PRECEDING AND CURRENT ROW
  )
)
SELECT
  person,
  yearMonth,
  CAST(avg_amount AS INT64) AS amount
FROM rolling
-- Keep only months whose window holds three rows; 201701 and 201702 drop out
-- for every person in the sample data.
WHERE months = 3
ORDER BY person, yearMonth
Output is as expected
person yearMonth amount
AA 201703 200
AA 201704 190
AB 201703 40
AB 201704 60
AC 201703 60
AC 201704 40
Added version for BigQuery Legacy SQL
#legacySQL
-- Trailing 3-month average of amount per person, reported for every month
-- that has the full three months (itself plus the two before it) present.
SELECT
  person,
  yearMonth,
  INTEGER(avg_amount) AS amount
FROM (
  SELECT
    person,
    yearMonth,
    -- RANGE (not ROWS) over the month ordinal: a missing month shrinks the
    -- window instead of silently pulling in an older month.
    AVG(amount) OVER (PARTITION BY person ORDER BY month_index RANGE BETWEEN 2 PRECEDING AND CURRENT ROW) AS avg_amount,
    -- Number of rows that actually fell inside the 3-month window.
    COUNT(1) OVER (PARTITION BY person ORDER BY month_index RANGE BETWEEN 2 PRECEDING AND CURRENT ROW) AS months
  FROM (
    SELECT
      person,
      amount,
      yearMonth,
      -- Consecutive-month ordinal: adjacent calendar months differ by exactly 1,
      -- including across a year boundary (201612 -> 201701).
      -- "/" is floating-point division here, so INTEGER() is used to extract the year.
      -- Assumes yearMonth is an integer of the form YYYYMM with MM in 1..12.
      INTEGER(yearMonth / 100) * 12 + yearMonth % 100 AS month_index
    FROM [project:dataset.table]
  )
)
-- Keep only months whose window holds three rows (one per month when each
-- person has a single row per month).
WHERE months = 3
-- Append "ORDER BY person, yearMonth" when a deterministic row order is needed.
You can test / play with it using below example with dummy data
#legacySQL
-- Self-contained demo: the comma-separated sub-selects (UNION ALL in legacy
-- SQL) stand in for the real table so the query can be run as-is.
SELECT
  person,
  yearMonth,
  INTEGER(avg_amount) AS amount
FROM (
  SELECT
    person,
    yearMonth,
    -- RANGE (not ROWS) over the month ordinal: a missing month shrinks the
    -- window instead of silently pulling in an older month.
    AVG(amount) OVER (PARTITION BY person ORDER BY month_index RANGE BETWEEN 2 PRECEDING AND CURRENT ROW) AS avg_amount,
    -- Number of rows that actually fell inside the 3-month window.
    COUNT(1) OVER (PARTITION BY person ORDER BY month_index RANGE BETWEEN 2 PRECEDING AND CURRENT ROW) AS months
  FROM (
    SELECT
      person,
      amount,
      yearMonth,
      -- Consecutive-month ordinal: adjacent calendar months differ by exactly 1,
      -- including across a year boundary (201612 -> 201701).
      -- "/" is floating-point division here, so INTEGER() is used to extract the year.
      -- Assumes yearMonth is an integer of the form YYYYMM with MM in 1..12.
      INTEGER(yearMonth / 100) * 12 + yearMonth % 100 AS month_index
    FROM -- [project:dataset.table]
      (SELECT 'AA' person, 100 amount, 201701 yearMonth),
      (SELECT 'AA' person, 200 amount, 201702 yearMonth),
      (SELECT 'AA' person, 300 amount, 201703 yearMonth),
      (SELECT 'AA' person, 70 amount, 201704 yearMonth),
      (SELECT 'AB' person, 10 amount, 201701 yearMonth),
      (SELECT 'AB' person, 50 amount, 201702 yearMonth),
      (SELECT 'AB' person, 60 amount, 201703 yearMonth),
      (SELECT 'AB' person, 70 amount, 201704 yearMonth),
      (SELECT 'AC' person, 70 amount, 201701 yearMonth),
      (SELECT 'AC' person, 80 amount, 201702 yearMonth),
      (SELECT 'AC' person, 30 amount, 201703 yearMonth),
      (SELECT 'AC' person, 10 amount, 201704 yearMonth)
  )
)
-- Keep only months whose window holds three rows; 201701 and 201702 drop out
-- for every person in the sample data.
WHERE months = 3
ORDER BY person, yearMonth
来源:https://stackoverflow.com/questions/47047859/aggregate-query-3-month-average-per-person-for-every-month