问题
I have the following data
-- Sample monthly readings for ids 1 and 2 (opening CTE of a larger statement).
-- ref_month is 'YYYY-MM-DD' text here, not a DATE. Some months are left out on
-- purpose to illustrate gaps; each gap is called out where the row would be.
WITH dummy_data AS (
  SELECT ref_month, value, id
  FROM UNNEST([
    STRUCT('2017-01-01' AS ref_month, 18 AS value, 1 AS id),
    ('2017-02-01', 20, 1),
    ('2017-03-01', 22, 1),
    -- id 1: no row for 2017-04-01 (value would be 28)
    ('2017-05-01', 30, 1),
    ('2017-06-01', 37, 1),
    ('2017-07-01', 42, 1),
    -- id 1: no row for 2017-08-01 (value would be 55)
    -- id 1: no row for 2017-09-01 (value would be 49)
    ('2017-10-01', 51, 1),
    ('2017-11-01', 57, 1),
    ('2017-12-01', 56, 1),
    ('2017-01-01', 18, 2),
    ('2017-02-01', 20, 2),
    ('2017-03-01', 22, 2),
    ('2017-04-01', 28, 2),
    -- id 2: no row for 2017-05-01 (value would be 30)
    -- id 2: no row for 2017-06-01 (value would be 37)
    ('2017-07-01', 42, 2),
    ('2017-08-01', 55, 2),
    ('2017-09-01', 49, 2),
    -- id 2: no row for 2017-10-01 (value would be 51)
    ('2017-11-01', 57, 2),
    ('2017-12-01', 56, 2)
  ])
)
And I want to calculate the moving average for each id. I know you can do something like the following:
-- Naive moving average: the frame counts physical rows, so each result averages
-- the current row and up to five preceding rows of the same id, however many
-- calendar months actually separate them.
SELECT
  id,
  ref_month,
  AVG(value) OVER last_six_rows AS moving_avg
FROM dummy_data
WINDOW last_six_rows AS (
  PARTITION BY id
  ORDER BY ref_month
  ROWS BETWEEN 5 PRECEDING AND CURRENT ROW
)
But as you can see from my dummy data, there are some missing values. Any ideas on how to calculate the moving average easily when some values are missing? I was thinking of first calculating a full date range
-- One row per calendar month, from the earliest to the latest ref_month found
-- in dummy_data. ref_month is parsed from 'YYYY-MM-DD' text.
date_range AS (
  SELECT reference_month
  FROM (
    -- Both ends of the range, computed in a single pass over dummy_data.
    SELECT
      PARSE_DATE('%Y-%m-%d', MIN(ref_month)) AS first_month,
      PARSE_DATE('%Y-%m-%d', MAX(ref_month)) AS last_month
    FROM dummy_data
  ) AS bounds
  CROSS JOIN UNNEST(
    GENERATE_DATE_ARRAY(bounds.first_month, bounds.last_month, INTERVAL 1 MONTH)
  ) AS reference_month
)
and then taking a Cartesian product with the ids and joining back to my dummy data, but this seems like an anti-pattern. Any idea how to do this optimally? Thanks
EDIT:
expected result: For id 1:
2017-01-01 18
2017-02-01 19
2017-03-01 20
2017-05-01 18
2017-06-01 21.8
2017-07-01 26.2
2017-10-01 26
2017-11-01 30
2017-12-01 32.8
For id 2:
2017-01-01 18
2017-02-01 19
2017-03-01 20
2017-04-01 22
2017-07-01 18.4
2017-08-01 25
2017-09-01 29.2
2017-11-01 40.6
2017-12-01 43.4
回答1:
Below is for BigQuery Standard SQL and actually works! :o)
It assumes that your ref_month is of the DATE data type (if in your case it is a STRING, that is still okay - see the note at the very bottom of my answer).
#standardSQL
-- Six-month moving average per id in which calendar months that have no row
-- count as zero. ref_month must be a DATE (CAST it first if it is a STRING).
-- The six-month width mirrors the question's `5 PRECEDING AND CURRENT ROW`;
-- the expected numbers listed in the question correspond to a five-month
-- window instead (4 PRECEDING, divisor capped at 5).
SELECT
  id,
  ref_month,
  ROUND(
    SUM(value) OVER rolling_six_months
    -- Divide by the calendar months the window covers: 6 once the id has that
    -- much history, fewer only while its series is still starting up.
    -- Measuring from the first row present in the frame (LAST_VALUE -
    -- FIRST_VALUE + 1) would shrink the divisor whenever the oldest months of
    -- the window are themselves missing.
    / LEAST(6, month_pos - MIN(month_pos) OVER (PARTITION BY id) + 1)
  ) AS correct_moving_avg
FROM (
  SELECT
    id,
    ref_month,
    value,
    -- Integer month index, so the frame can be expressed as a numeric RANGE
    -- over months rather than a count of rows.
    DATE_DIFF(ref_month, DATE '2016-01-01', MONTH) AS month_pos
  FROM dummy_data
)
WINDOW rolling_six_months AS (
  PARTITION BY id
  ORDER BY month_pos
  RANGE BETWEEN 5 PRECEDING AND CURRENT ROW
)
You can test / play with it using your example data as below
#standardSQL
-- Self-contained demo: six-month moving average per id in which calendar
-- months that have no row count as zero.
WITH dummy_data AS (
  -- Sample data with DATE-typed ref_month; gaps are called out in place.
  SELECT ref_month, value, id
  FROM UNNEST([
    STRUCT(DATE '2017-01-01' AS ref_month, 18 AS value, 1 AS id),
    (DATE '2017-02-01', 20, 1),
    (DATE '2017-03-01', 22, 1),
    -- id 1: no row for 2017-04-01 (value would be 28)
    (DATE '2017-05-01', 30, 1),
    (DATE '2017-06-01', 37, 1),
    (DATE '2017-07-01', 42, 1),
    -- id 1: no row for 2017-08-01 (value would be 55)
    -- id 1: no row for 2017-09-01 (value would be 49)
    (DATE '2017-10-01', 51, 1),
    (DATE '2017-11-01', 57, 1),
    (DATE '2017-12-01', 56, 1),
    (DATE '2017-01-01', 18, 2),
    (DATE '2017-02-01', 20, 2),
    (DATE '2017-03-01', 22, 2),
    (DATE '2017-04-01', 28, 2),
    -- id 2: no row for 2017-05-01 (value would be 30)
    -- id 2: no row for 2017-06-01 (value would be 37)
    (DATE '2017-07-01', 42, 2),
    (DATE '2017-08-01', 55, 2),
    (DATE '2017-09-01', 49, 2),
    -- id 2: no row for 2017-10-01 (value would be 51)
    (DATE '2017-11-01', 57, 2),
    (DATE '2017-12-01', 56, 2)
  ])
)
SELECT
  id,
  ref_month,
  ROUND(
    SUM(value) OVER rolling_six_months
    -- Divide by the calendar months the window covers: 6 once the id has that
    -- much history, fewer only while its series is still starting up.
    -- Measuring from the first row present in the frame (LAST_VALUE -
    -- FIRST_VALUE + 1) would shrink the divisor whenever the oldest months of
    -- the window are missing; on this data that affects id 2 / 2017-11-01,
    -- where 203 must be divided by 6, not 5.
    / LEAST(6, month_pos - MIN(month_pos) OVER (PARTITION BY id) + 1)
  ) AS correct_moving_avg
FROM (
  SELECT
    id,
    ref_month,
    value,
    -- Integer month index, so the frame can be a numeric RANGE over months.
    DATE_DIFF(ref_month, DATE '2016-01-01', MONTH) AS month_pos
  FROM dummy_data
)
WINDOW rolling_six_months AS (
  PARTITION BY id
  ORDER BY month_pos
  RANGE BETWEEN 5 PRECEDING AND CURRENT ROW
)
ORDER BY id, ref_month
To help you explore the logic, see the "expanded" version of the above query below - it propagates all intermediate values up to the outermost SELECT so you can see everything ...
#standardSQL
-- Expanded demo: same six-month moving average, with every intermediate value
-- exposed in the output so the calculation can be followed row by row.
WITH dummy_data AS (
  -- Sample data with DATE-typed ref_month; gaps are called out in place.
  SELECT ref_month, value, id
  FROM UNNEST([
    STRUCT(DATE '2017-01-01' AS ref_month, 18 AS value, 1 AS id),
    (DATE '2017-02-01', 20, 1),
    (DATE '2017-03-01', 22, 1),
    -- id 1: no row for 2017-04-01 (value would be 28)
    (DATE '2017-05-01', 30, 1),
    (DATE '2017-06-01', 37, 1),
    (DATE '2017-07-01', 42, 1),
    -- id 1: no row for 2017-08-01 (value would be 55)
    -- id 1: no row for 2017-09-01 (value would be 49)
    (DATE '2017-10-01', 51, 1),
    (DATE '2017-11-01', 57, 1),
    (DATE '2017-12-01', 56, 1),
    (DATE '2017-01-01', 18, 2),
    (DATE '2017-02-01', 20, 2),
    (DATE '2017-03-01', 22, 2),
    (DATE '2017-04-01', 28, 2),
    -- id 2: no row for 2017-05-01 (value would be 30)
    -- id 2: no row for 2017-06-01 (value would be 37)
    (DATE '2017-07-01', 42, 2),
    (DATE '2017-08-01', 55, 2),
    (DATE '2017-09-01', 49, 2),
    -- id 2: no row for 2017-10-01 (value would be 51)
    (DATE '2017-11-01', 57, 2),
    (DATE '2017-12-01', 56, 2)
  ])
)
SELECT
  id,
  ref_month,
  value,
  moving_sum,
  first_month,
  last_month,
  -- Divisor = calendar months covered by the window: 6 once the id has that
  -- much history, fewer only while its series is still starting up.
  -- A span-based divisor (last_month - first_month + 1) is too small whenever
  -- the oldest months of the window are missing. On this sample data that
  -- changes exactly one row: id 2 / 2017-11-01 becomes ROUND(203 / 6) = 34
  -- instead of ROUND(203 / 5) = 41.
  ROUND(moving_sum / LEAST(6, last_month - series_start + 1)) AS correct_moving_avg,
  moving_avg
FROM (
  SELECT
    id,
    ref_month,
    value,
    SUM(value) OVER rolling_six_months AS moving_sum,
    -- Month index of the oldest / newest row actually present in the frame.
    FIRST_VALUE(month_pos) OVER rolling_six_months AS first_month,
    LAST_VALUE(month_pos) OVER rolling_six_months AS last_month,
    -- Month index of the id's very first row (whole partition, no frame).
    MIN(month_pos) OVER (PARTITION BY id) AS series_start,
    -- Plain AVG ignores missing months; kept for comparison.
    AVG(value) OVER rolling_six_months AS moving_avg
  FROM (
    SELECT
      id,
      ref_month,
      value,
      -- Integer month index, so the frame can be a numeric RANGE over months.
      DATE_DIFF(ref_month, DATE '2016-01-01', MONTH) AS month_pos
    FROM dummy_data
  )
  WINDOW rolling_six_months AS (
    PARTITION BY id
    ORDER BY month_pos
    RANGE BETWEEN 5 PRECEDING AND CURRENT ROW
  )
)
ORDER BY id, ref_month
with result as
id ref_month value moving_sum first_month last_month correct_moving_avg moving_avg
1 2017-01-01 18 18 12 12 18.0 18.0
1 2017-02-01 20 38 12 13 19.0 19.0
1 2017-03-01 22 60 12 14 20.0 20.0
1 2017-05-01 30 90 12 16 18.0 22.5
1 2017-06-01 37 127 12 17 21.0 25.4
1 2017-07-01 42 151 13 18 25.0 30.2
1 2017-10-01 51 160 16 21 27.0 40.0
1 2017-11-01 57 187 17 22 31.0 46.75
1 2017-12-01 56 206 18 23 34.0 51.5
2 2017-01-01 18 18 12 12 18.0 18.0
2 2017-02-01 20 38 12 13 19.0 19.0
2 2017-03-01 22 60 12 14 20.0 20.0
2 2017-04-01 28 88 12 15 22.0 22.0
2 2017-07-01 42 112 13 18 19.0 28.0
2 2017-08-01 55 147 14 19 25.0 36.75
2 2017-09-01 49 174 15 20 29.0 43.5
2 2017-11-01 57 203 18 22 41.0 50.75
2 2017-12-01 56 259 18 23 43.0 51.8
I hope this shows and explains the approach.
Note: if your ref_month field is of the STRING data type, you should slightly adjust the line with DATE_DIFF - it should be:
-- STRING ref_month: convert to DATE before taking the month difference.
DATE_DIFF(CAST(ref_month AS DATE), DATE '2016-01-01', MONTH) AS month_pos
Note 2: I picked '2016-01-01' as the starting point for counting months, but you can pick any date as long as it is earlier than your minimum date - for example, '2000-01-01' will work perfectly too.
回答2:
This should work:
-- Five-month moving average per id in which calendar months that have no row
-- count as zero. A window with only PARTITION BY / ORDER BY and no frame
-- averages every row from the start of the partition, i.e. a cumulative
-- average that also ignores the gaps; the explicit RANGE frame over a month
-- index plus a fixed divisor is what makes this a moving average.
-- Window width and divisor were chosen to reproduce the expected results
-- listed in the question (e.g. id 1 / 2017-06-01 = 109 / 5 = 21.8).
WITH dummy_data AS (
  -- Sample data; ref_month is 'YYYY-MM-DD' text. Gaps are called out in place.
  SELECT ref_month, value, id
  FROM UNNEST([
    STRUCT('2017-01-01' AS ref_month, 18 AS value, 1 AS id),
    ('2017-02-01', 20, 1),
    ('2017-03-01', 22, 1),
    -- id 1: no row for 2017-04-01 (value would be 28)
    ('2017-05-01', 30, 1),
    ('2017-06-01', 37, 1),
    ('2017-07-01', 42, 1),
    -- id 1: no row for 2017-08-01 (value would be 55)
    -- id 1: no row for 2017-09-01 (value would be 49)
    ('2017-10-01', 51, 1),
    ('2017-11-01', 57, 1),
    ('2017-12-01', 56, 1),
    ('2017-01-01', 18, 2),
    ('2017-02-01', 20, 2),
    ('2017-03-01', 22, 2),
    ('2017-04-01', 28, 2),
    -- id 2: no row for 2017-05-01 (value would be 30)
    -- id 2: no row for 2017-06-01 (value would be 37)
    ('2017-07-01', 42, 2),
    ('2017-08-01', 55, 2),
    ('2017-09-01', 49, 2),
    -- id 2: no row for 2017-10-01 (value would be 51)
    ('2017-11-01', 57, 2),
    ('2017-12-01', 56, 2)
  ])
),
monthly AS (
  -- One row per (id, month). Duplicate readings for a month collapse to their
  -- mean, and month_pos is an integer month index usable in a RANGE frame.
  SELECT
    id,
    ref_month,
    DATE_DIFF(CAST(ref_month AS DATE), DATE '2000-01-01', MONTH) AS month_pos,
    AVG(value) AS month_value
  FROM dummy_data
  GROUP BY id, ref_month
)
SELECT
  id,
  ref_month,
  SUM(month_value) OVER rolling_five_months
    -- Months covered by the window: 5 once the id has that much history,
    -- fewer only for the first rows of its series.
    / LEAST(5, month_pos - MIN(month_pos) OVER (PARTITION BY id) + 1) AS moving_avg
FROM monthly
WINDOW rolling_five_months AS (
  PARTITION BY id
  ORDER BY month_pos
  RANGE BETWEEN 4 PRECEDING AND CURRENT ROW
)
回答3:
If you want to treat the missing values as 0 and you want a window of "5", then a series of lag() calls might be the simplest approach:
-- Five-month moving average per id built from LAG(); months with no row
-- contribute 0 to the sum but still count in the divisor.
-- Assumes ref_month is a DATE (CAST it first if it is a STRING) and that each
-- id has at most one row per month - confirm against the real table.
SELECT
  id,
  ref_month,
  (
    value
    -- Each earlier row is added only when it lies inside the window
    -- [ref_month - 4 months, ref_month]. The lower bound is inclusive: with a
    -- strict ">" the oldest month of the window would be dropped.
    -- LAG() is NULL near the start of a partition; the comparison is then
    -- NULL and the CASE falls through to ELSE 0.
    + CASE
        WHEN LAG(ref_month, 1) OVER id_by_month >= DATE_SUB(ref_month, INTERVAL 4 MONTH)
          THEN LAG(value, 1) OVER id_by_month
        ELSE 0
      END
    + CASE
        WHEN LAG(ref_month, 2) OVER id_by_month >= DATE_SUB(ref_month, INTERVAL 4 MONTH)
          THEN LAG(value, 2) OVER id_by_month
        ELSE 0
      END
    + CASE
        WHEN LAG(ref_month, 3) OVER id_by_month >= DATE_SUB(ref_month, INTERVAL 4 MONTH)
          THEN LAG(value, 3) OVER id_by_month
        ELSE 0
      END
    + CASE
        WHEN LAG(ref_month, 4) OVER id_by_month >= DATE_SUB(ref_month, INTERVAL 4 MONTH)
          THEN LAG(value, 4) OVER id_by_month
        ELSE 0
      END
  )
  -- Months covered so far, capped at 5. DATE_DIFF takes (later, earlier, part);
  -- the + 1 makes the id's first month divide by 1 instead of 0.
  / LEAST(
      5,
      DATE_DIFF(ref_month, MIN(ref_month) OVER (PARTITION BY id), MONTH) + 1
    ) AS moving_avg
FROM dummy_data
WINDOW id_by_month AS (PARTITION BY id ORDER BY ref_month);
The query is more complicated than the logic. It basically adds up the five most recent values and divides by 5, but it takes boundary conditions into account (along with missing values).
来源:https://stackoverflow.com/questions/49883311/bigquery-moving-average-with-missing-values