BIGQUERY moving average with missing values

Deadly 提交于 2019-12-08 07:00:31

问题


I have the following data

-- Sample input: one row per (id, month). ref_month is a STRING in 'YYYY-MM-DD' form.
-- Rows left as comments are the months that are deliberately missing for that id.
with dummy_data as
(
  select ref_month, value, id
  from unnest([
    struct('2017-01-01' as ref_month, 18 as value, 1 as id),
    ('2017-02-01', 20, 1),
    ('2017-03-01', 22, 1),
    -- ('2017-04-01', 28, 1),
    ('2017-05-01', 30, 1),
    ('2017-06-01', 37, 1),
    ('2017-07-01', 42, 1),
    -- ('2017-08-01', 55, 1),
    -- ('2017-09-01', 49, 1),
    ('2017-10-01', 51, 1),
    ('2017-11-01', 57, 1),
    ('2017-12-01', 56, 1),
    ('2017-01-01', 18, 2),
    ('2017-02-01', 20, 2),
    ('2017-03-01', 22, 2),
    ('2017-04-01', 28, 2),
    -- ('2017-05-01', 30, 2),
    -- ('2017-06-01', 37, 2),
    ('2017-07-01', 42, 2),
    ('2017-08-01', 55, 2),
    ('2017-09-01', 49, 2),
    -- ('2017-10-01', 51, 2),
    ('2017-11-01', 57, 2),
    ('2017-12-01', 56, 2)
  ])
)

And I want to calculate the moving average for each id. I know you can do something like the following:

-- Naive moving average per id.
-- The frame is counted in ROWS (the current row plus the five rows before it),
-- so it always averages up to six existing rows regardless of how many
-- calendar months lie between them.
SELECT
  id,
  ref_month,
  AVG(value) OVER last_six_rows AS moving_avg
FROM dummy_data
WINDOW last_six_rows AS (
  PARTITION BY id
  ORDER BY ref_month
  ROWS BETWEEN 5 PRECEDING AND CURRENT ROW
)

But as you can see from my dummy data, some months are missing. Any ideas on how to calculate the moving average easily when there are missing values? I was thinking of first calculating a full date range:

-- One row per month, stepping by one month from the smallest to the largest
-- ref_month found in dummy_data.
date_range AS
(
  SELECT reference_month
  FROM (
      -- ref_month is parsed with '%Y-%m-%d' below, i.e. it is handled as a string here.
      SELECT
          MIN(ref_month) AS first_month
        , MAX(ref_month) AS last_month
      FROM dummy_data
  ) AS bounds
  CROSS JOIN UNNEST(
      GENERATE_DATE_ARRAY(
          PARSE_DATE('%Y-%m-%d', bounds.first_month)
        , PARSE_DATE('%Y-%m-%d', bounds.last_month)
        , INTERVAL 1 MONTH
      )
  ) AS reference_month
)

and then do a Cartesian product with the ids and then join back with my dummy data, but this seems like an anti-pattern. Any ideas on how to do this optimally? Thanks

EDIT:

expected result: For id 1:

2017-01-01  18
2017-02-01  19
2017-03-01  20
2017-05-01  18
2017-06-01  21.8
2017-07-01  26.2
2017-10-01  26
2017-11-01  30
2017-12-01  32.8

For id 2:

2017-01-01  18
2017-02-01  19
2017-03-01  20
2017-04-01  22
2017-07-01  18.4
2017-08-01  25
2017-09-01  29.2
2017-11-01  40.6
2017-12-01  43.4

回答1:


Below is for BigQuery Standard SQL and actually works! :o)
It assumes that your ref_month is of DATE data type (if in your case you have it as STRING - still Okay - see note at the very bottom of my answer)

#standardSQL
-- Gap-aware moving average per id.
-- month_pos turns each ref_month into an integer month index, so a RANGE frame
-- over it covers calendar months (current month and the 5 before it) instead
-- of a fixed number of rows.
-- NOTE(review): the divisor is the month span between the first and last rows
-- that exist inside the frame, so it is smaller than 6 when the earliest months
-- of the frame are missing.
-- NOTE(review): ROUND is called without a precision argument, so the result is
-- rounded to a whole number.
SELECT
  id,
  ref_month,
  ROUND(
    SUM(value) OVER rolling_six_months
    / (
        LAST_VALUE(month_pos) OVER rolling_six_months
        - FIRST_VALUE(month_pos) OVER rolling_six_months
        + 1
      )
  ) AS correct_moving_avg
FROM (
  SELECT
    id,
    ref_month,
    value,
    -- whole months between the fixed anchor date and ref_month
    DATE_DIFF(ref_month, '2016-01-01', MONTH) AS month_pos
  FROM dummy_data
) AS positioned
WINDOW rolling_six_months AS (
  PARTITION BY id
  ORDER BY month_pos
  RANGE BETWEEN 5 PRECEDING AND CURRENT ROW
)

You can test / play with it using your example data as below

#standardSQL
WITH dummy_data AS (
  -- Sample input with DATE months; rows left as comments are the missing months.
  SELECT ref_month, value, id
  FROM UNNEST([
    STRUCT(DATE '2017-01-01' AS ref_month, 18 AS value, 1 AS id),
    (DATE '2017-02-01', 20, 1),
    (DATE '2017-03-01', 22, 1),
    -- (DATE '2017-04-01', 28, 1),
    (DATE '2017-05-01', 30, 1),
    (DATE '2017-06-01', 37, 1),
    (DATE '2017-07-01', 42, 1),
    -- (DATE '2017-08-01', 55, 1),
    -- (DATE '2017-09-01', 49, 1),
    (DATE '2017-10-01', 51, 1),
    (DATE '2017-11-01', 57, 1),
    (DATE '2017-12-01', 56, 1),
    (DATE '2017-01-01', 18, 2),
    (DATE '2017-02-01', 20, 2),
    (DATE '2017-03-01', 22, 2),
    (DATE '2017-04-01', 28, 2),
    -- (DATE '2017-05-01', 30, 2),
    -- (DATE '2017-06-01', 37, 2),
    (DATE '2017-07-01', 42, 2),
    (DATE '2017-08-01', 55, 2),
    (DATE '2017-09-01', 49, 2),
    -- (DATE '2017-10-01', 51, 2),
    (DATE '2017-11-01', 57, 2),
    (DATE '2017-12-01', 56, 2)
  ])
),
positioned AS (
  -- month_pos: whole months between the fixed anchor date and ref_month,
  -- used as the numeric ordering key for the RANGE frame.
  SELECT
    id,
    ref_month,
    value,
    DATE_DIFF(ref_month, '2016-01-01', MONTH) AS month_pos
  FROM dummy_data
)
-- NOTE(review): the divisor is the month span between the first and last rows
-- present in the frame; ROUND without a precision rounds to a whole number.
SELECT
  id,
  ref_month,
  ROUND(
    SUM(value) OVER rolling_six_months
    / (
        LAST_VALUE(month_pos) OVER rolling_six_months
        - FIRST_VALUE(month_pos) OVER rolling_six_months
        + 1
      )
  ) AS correct_moving_avg
FROM positioned
WINDOW rolling_six_months AS (
  PARTITION BY id
  ORDER BY month_pos
  RANGE BETWEEN 5 PRECEDING AND CURRENT ROW
)
ORDER BY id, ref_month

To help you explore the logic, see the "expanded" version of the above query below - it propagates all the intermediate values up to the outermost SELECT so you can see everything ...

#standardSQL
WITH dummy_data AS (
  -- Sample input with DATE months; rows left as comments are the missing months.
  SELECT ref_month, value, id
  FROM UNNEST([
    STRUCT(DATE '2017-01-01' AS ref_month, 18 AS value, 1 AS id),
    (DATE '2017-02-01', 20, 1),
    (DATE '2017-03-01', 22, 1),
    -- (DATE '2017-04-01', 28, 1),
    (DATE '2017-05-01', 30, 1),
    (DATE '2017-06-01', 37, 1),
    (DATE '2017-07-01', 42, 1),
    -- (DATE '2017-08-01', 55, 1),
    -- (DATE '2017-09-01', 49, 1),
    (DATE '2017-10-01', 51, 1),
    (DATE '2017-11-01', 57, 1),
    (DATE '2017-12-01', 56, 1),
    (DATE '2017-01-01', 18, 2),
    (DATE '2017-02-01', 20, 2),
    (DATE '2017-03-01', 22, 2),
    (DATE '2017-04-01', 28, 2),
    -- (DATE '2017-05-01', 30, 2),
    -- (DATE '2017-06-01', 37, 2),
    (DATE '2017-07-01', 42, 2),
    (DATE '2017-08-01', 55, 2),
    (DATE '2017-09-01', 49, 2),
    -- (DATE '2017-10-01', 51, 2),
    (DATE '2017-11-01', 57, 2),
    (DATE '2017-12-01', 56, 2)
  ])
),
positioned AS (
  -- month_pos: whole months between the fixed anchor date and ref_month.
  SELECT
    id,
    ref_month,
    value,
    DATE_DIFF(ref_month, '2016-01-01', MONTH) AS month_pos
  FROM dummy_data
),
windowed AS (
  -- Every intermediate value of the six-month RANGE frame, exposed as a column.
  SELECT
    id,
    ref_month,
    value,
    SUM(value) OVER rolling_six_months AS moving_sum,
    FIRST_VALUE(month_pos) OVER rolling_six_months AS first_month,
    LAST_VALUE(month_pos) OVER rolling_six_months AS last_month,
    -- plain AVG divides by the number of rows present in the frame
    AVG(value) OVER rolling_six_months AS moving_avg
  FROM positioned
  WINDOW rolling_six_months AS (
    PARTITION BY id
    ORDER BY month_pos
    RANGE BETWEEN 5 PRECEDING AND CURRENT ROW
  )
)
SELECT
  id,
  ref_month,
  value,
  moving_sum,
  first_month,
  last_month,
  -- divides by the month span covered by the rows present in the frame;
  -- ROUND without a precision rounds to a whole number
  ROUND(moving_sum / (last_month - first_month + 1)) AS correct_moving_avg,
  moving_avg
FROM windowed
ORDER BY id, ref_month

with result as

id  ref_month   value moving_sum    first_month last_month  correct_moving_avg  moving_avg   
1    2017-01-01 18    18            12          12          18.0                  18.0   
1    2017-02-01 20    38            12          13          19.0                  19.0   
1    2017-03-01 22    60            12          14          20.0                  20.0   
1    2017-05-01 30    90            12          16          18.0                  22.5   
1    2017-06-01 37    127           12          17          21.0                  25.4   
1    2017-07-01 42    151           13          18          25.0                  30.2   
1    2017-10-01 51    160           16          21          27.0                  40.0   
1    2017-11-01 57    187           17          22          31.0                  46.75  
1    2017-12-01 56    206           18          23          34.0                  51.5   
2    2017-01-01 18    18            12          12          18.0                  18.0   
2    2017-02-01 20    38            12          13          19.0                  19.0   
2    2017-03-01 22    60            12          14          20.0                  20.0   
2    2017-04-01 28    88            12          15          22.0                  22.0   
2    2017-07-01 42    112           13          18          19.0                  28.0   
2    2017-08-01 55    147           14          19          25.0                  36.75  
2    2017-09-01 49    174           15          20          29.0                  43.5   
2    2017-11-01 57    203           18          22          41.0                  50.75  
2    2017-12-01 56    259           18          23          43.0                  51.8     

Hope this shows/explains the approach.

Note: if your ref_month field is of STRING data type, you should slightly adjust the line with DATE_DIFF - it should be as follows:

DATE_DIFF(CAST(ref_month AS DATE), '2016-01-01', MONTH) AS month_pos

Note 2: I picked '2016-01-01' as the starting point for counting months, but you can pick any date that is earlier than your minimum date - for example, '2000-01-01' will work perfectly too.




回答2:


This should work:

with dummy_data as 
(
SELECT '2017-01-01' as ref_month, 18 as value, 1 as id
UNION ALL SELECT '2017-02-01' as ref_month, 20 as value, 1 as id
UNION ALL SELECT '2017-03-01' as ref_month, 22 as value, 1 as id
-- UNION ALL SELECT '2017-04-01' as ref_month, 28 as value, 1 as id
UNION ALL SELECT '2017-05-01' as ref_month, 30 as value, 1 as id
UNION ALL SELECT '2017-06-01' as ref_month, 37 as value, 1 as id
UNION ALL SELECT '2017-07-01' as ref_month, 42 as value, 1 as id
-- UNION ALL SELECT '2017-08-01' as ref_month, 55 as value, 1 as id
-- UNION ALL SELECT '2017-09-01' as ref_month, 49 as value, 1 as id
UNION ALL SELECT '2017-10-01' as ref_month, 51 as value, 1 as id
UNION ALL SELECT '2017-11-01' as ref_month, 57 as value, 1 as id
UNION ALL SELECT '2017-12-01' as ref_month, 56 as value, 1 as id
UNION ALL SELECT '2017-01-01' as ref_month, 18 as value, 2 as id
UNION ALL SELECT '2017-02-01' as ref_month, 20 as value, 2 as id
UNION ALL SELECT '2017-03-01' as ref_month, 22 as value, 2 as id
UNION ALL SELECT '2017-04-01' as ref_month, 28 as value, 2 as id
-- UNION ALL SELECT '2017-05-01' as ref_month, 30 as value, 2 as id
-- UNION ALL SELECT '2017-06-01' as ref_month, 37 as value, 2 as id
UNION ALL SELECT '2017-07-01' as ref_month, 42 as value, 2 as id
UNION ALL SELECT '2017-08-01' as ref_month, 55 as value, 2 as id
UNION ALL SELECT '2017-09-01' as ref_month, 49 as value, 2 as id
-- UNION ALL SELECT '2017-10-01' as ref_month, 51 as value, 2 as id
UNION ALL SELECT '2017-11-01' as ref_month, 57 as value, 2 as id
UNION ALL SELECT '2017-12-01' as ref_month, 56 as value, 2 as id
)
, monthly as
(
-- One row per (id, month). month_pos is an integer month index, so a gap in
-- the data shows up as a jump in month_pos.
-- ref_month is a STRING in dummy_data, hence the cast to DATE.
select 
    id
  , ref_month
  , avg(value) as month_value
  , date_diff(cast(ref_month as date), date '2000-01-01', month) as month_pos
from 
    dummy_data
    group by id
  , ref_month
)

-- 5-month moving average per id with missing months counted as 0.
-- The RANGE frame covers the current month and the 4 calendar months before it,
-- no matter how many rows exist in between. The previous version averaged over
-- the default frame (every row from the start of the partition), which is a
-- cumulative average and ignores gaps.
-- The divisor is the number of calendar months in the frame, capped by the
-- number of months elapsed since the id's first month.
select 
    id
  , ref_month
  , sum(month_value) over rolling_five_months
      / least(5, month_pos - min(month_pos) over (partition by id) + 1) as moving_avg
from 
    monthly
window rolling_five_months as
  (partition by id order by month_pos range between 4 preceding and current row)


回答3:


If you want to treat the missing values as 0 and you want a window of "5", then a series of lag() calls might be the simplest approach:

-- 5-month moving average per id with missing months counted as 0.
-- Requires ref_month to be a DATE (cast it first if it is stored as STRING) and
-- at most one row per (id, month).
-- Each of the four previous rows contributes its value only when it falls
-- inside the window [ref_month - 4 months, ref_month]; lag() returns NULL when
-- there is no such row, the comparison is then not true and the ELSE 0 applies.
select id, ref_month,
       (value +
        (case when lag(ref_month, 1) over prev_rows >= date_sub(ref_month, interval 4 month)
              then lag(value, 1) over prev_rows
              else 0
         end) +
        (case when lag(ref_month, 2) over prev_rows >= date_sub(ref_month, interval 4 month)
              then lag(value, 2) over prev_rows
              else 0
         end) +
        (case when lag(ref_month, 3) over prev_rows >= date_sub(ref_month, interval 4 month)
              then lag(value, 3) over prev_rows
              else 0
         end) +
        (case when lag(ref_month, 4) over prev_rows >= date_sub(ref_month, interval 4 month)
              then lag(value, 4) over prev_rows
              else 0
         end)
       ) / 
       -- months elapsed since the id's first month (inclusive), capped at the
       -- window size; always >= 1, so the division cannot hit zero
       least(5, date_diff(ref_month, min(ref_month) over (partition by id), month) + 1) as moving_avg
from dummy_data
window prev_rows as (partition by id order by ref_month);

The query is more complicated than the logic. It basically adds up the five most recent values and divides by 5, but it takes boundary conditions into account (along with missing values).



来源:https://stackoverflow.com/questions/49883311/bigquery-moving-average-with-missing-values

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!