问题
I have a table of orders that looks something like this:
WITH my_table_of_orders AS (
  -- Three-row fixture: one customer with two orders on consecutive days from
  -- the particular store, and one customer with a single order elsewhere.
  SELECT
    order_id,
    date,
    customer_id,
    is_from_particular_store
  FROM UNNEST([
    STRUCT(1 AS order_id, DATE(2019, 5, 12) AS date, 5 AS customer_id, TRUE AS is_from_particular_store),
    STRUCT(2, DATE(2019, 5, 11), 5, TRUE),
    STRUCT(3, DATE(2019, 5, 11), 4, FALSE)
  ])
)
My actual table contains ~59 million rows.
What I would like to do is essentially return one row per order date, with a second column giving the percentage of customers who placed orders in the past year (relative to the current row's date) that placed an order with a particular store (this is where my fictitious `is_from_particular_store` column comes in handy).
Ideally I could use the following query and not run into resource issues. The only problem is that, it seems, you cannot use `ORDER BY` together with `DISTINCT` in an analytic function; I get the error "Window ORDER BY is not allowed if DISTINCT is specified":
-- First attempt: per order date, the ratio of distinct customers who ordered
-- from the particular store to all distinct customers, over a trailing
-- 365-day window ending on that date.
-- NOTE: BigQuery rejects this query with
--   "Window ORDER BY is not allowed if DISTINCT is specified"
-- because both COUNT(DISTINCT ...) window calls below use ORDER BY.
SELECT
date,
-- Plain division (not SAFE_DIVIDE) of the two windowed distinct counts.
last_year_customer_id_that_ordered_from_a_particular_store / last_year_customer_id_that_ordered AS number_i_want
FROM (
SELECT
date,
-- Numbers the rows within each date; there is no ORDER BY, so row_num is
-- only used by the outer filter to keep a single row per date.
ROW_NUMBER() OVER (
PARTITION BY
date
) AS row_num,
-- Distinct customers with any order in the trailing window.
COUNT(DISTINCT customer_id) OVER(
ORDER BY
-- The date is converted to epoch seconds so the RANGE frame can use a numeric offset.
UNIX_SECONDS(TIMESTAMP(date))
-- 31,536,000 = 365 days in seconds
RANGE BETWEEN 31536000 PRECEDING AND CURRENT ROW
) AS last_year_customer_id_that_ordered,
-- Distinct customers in the same window whose order came from the particular
-- store; the IF yields NULL for other orders, which COUNT ignores.
COUNT(DISTINCT IF(is_from_particular_store, customer_id, NULL)) OVER(
ORDER BY
UNIX_SECONDS(TIMESTAMP(date))
-- 31,536,000 = 365 days in seconds
RANGE BETWEEN 31536000 PRECEDING AND CURRENT ROW
) AS last_year_customer_id_that_ordered_from_a_particular_store,
FROM my_table_of_orders
)
WHERE
-- only return one row per date
row_num = 1
I then tried using `ARRAY_AGG` and `UNNEST` instead:
-- Second attempt: gather the customer ids seen in the trailing 365-day window
-- into arrays, then count the distinct ids in each array per order date.
SELECT
  date,
  SAFE_DIVIDE(
    (
      SELECT COUNT(DISTINCT store_customer_id)
      FROM UNNEST(last_year_customer_id_that_ordered_from_a_particular_store) AS store_customer_id
    ),
    (
      SELECT COUNT(DISTINCT any_customer_id)
      FROM UNNEST(last_year_customer_id_that_ordered) AS any_customer_id
    )
  ) AS number_i_want_to_calculate
FROM (
  SELECT
    date,
    -- Every customer id in the window (duplicates kept; deduplicated above).
    ARRAY_AGG(customer_id) OVER trailing_year
      AS last_year_customer_id_that_ordered,
    -- Same window, but NULL for orders that are not from the particular store.
    ARRAY_AGG(IF(is_from_particular_store, customer_id, NULL)) OVER trailing_year
      AS last_year_customer_id_that_ordered_from_a_particular_store,
    ROW_NUMBER() OVER (PARTITION BY date) AS row_num
  FROM my_table_of_orders
  WINDOW trailing_year AS (
    ORDER BY UNIX_SECONDS(TIMESTAMP(date))
    -- 31,536,000 seconds = 365 days
    RANGE BETWEEN 31536000 PRECEDING AND CURRENT ROW
  )
)
-- Keep a single row per date.
WHERE row_num = 1
The only problem with this is that I get the following resource issue...
Resources exceeded during query execution: The query could not be executed in the allotted memory.
This question is incredibly similar: https://stackoverflow.com/a/42567839/3902555. It suggests using `ARRAY_AGG` + `UNNEST`, but like I said, this causes resource issues for me :(
Anyone know of a more resource efficient way to calculate the statistic I am after?
回答1:
Another totally refactored version (BigQuery Standard SQL)
#standardSQL
-- For every order date, returns the share of distinct customers with an order
-- in the trailing 365 days (inclusive of that date) who ordered from the
-- particular store. SAFE_DIVIDE yields NULL instead of failing on a zero
-- denominator.
WITH distinct_orders AS (
  -- Repeat orders by the same customer on the same date with the same store
  -- flag cannot change a distinct count, so collapse them before joining.
  SELECT DISTINCT
    date,
    customer_id,
    is_from_particular_store
  FROM my_table_of_orders
),
order_dates AS (
  -- One row per reporting date keeps the date side of the range join small.
  SELECT DISTINCT date
  FROM distinct_orders
)
SELECT
  d.date,
  SAFE_DIVIDE(
    COUNT(DISTINCT IF(o.is_from_particular_store, o.customer_id, NULL)),
    COUNT(DISTINCT o.customer_id)
  ) AS number_i_want_to_calculate
FROM order_dates AS d
INNER JOIN distinct_orders AS o
  -- Look back 365 days and never forward. DATE_DIFF(d.date, o.date, YEAR) < 1
  -- is not equivalent: it counts calendar-year boundaries, so it would match
  -- every later order and drop orders just across a New Year boundary.
  ON o.date BETWEEN DATE_SUB(d.date, INTERVAL 365 DAY) AND d.date
GROUP BY d.date
An alternative to the above is to use approximate aggregation, as in the example below:
#standardSQL
-- Approximate variant: for every order date, estimates the share of distinct
-- customers with an order in the trailing 365 days (inclusive of that date)
-- who ordered from the particular store. APPROX_COUNT_DISTINCT trades exactness
-- for lower resource usage.
WITH distinct_orders AS (
  -- Repeat orders by the same customer on the same date with the same store
  -- flag cannot change a distinct count, so collapse them before joining.
  SELECT DISTINCT
    date,
    customer_id,
    is_from_particular_store
  FROM my_table_of_orders
),
order_dates AS (
  -- One row per reporting date keeps the date side of the range join small.
  SELECT DISTINCT date
  FROM distinct_orders
)
SELECT
  d.date,
  SAFE_DIVIDE(
    APPROX_COUNT_DISTINCT(IF(o.is_from_particular_store, o.customer_id, NULL)),
    APPROX_COUNT_DISTINCT(o.customer_id)
  ) AS number_i_want_to_calculate
FROM order_dates AS d
INNER JOIN distinct_orders AS o
  -- Look back 365 days and never forward. DATE_DIFF(d.date, o.date, YEAR) < 1
  -- is not equivalent: it counts calendar-year boundaries, so it would match
  -- every later order and drop orders just across a New Year boundary.
  ON o.date BETWEEN DATE_SUB(d.date, INTERVAL 365 DAY) AND d.date
GROUP BY d.date
回答2:
Below is for BigQuery Standard SQL
Try the slightly refactored version below. It is mostly based on first de-duplicating customers on the same date and on removing the use of ROW_NUMBER(), which is usually a heavy resource eater.
Obviously I am not able to test on your real data, so I don't know whether this will be enough or whether further improvements are still needed - so try it and let us know.
#standardSQL
-- Per order date: distinct customers of the particular store divided by all
-- distinct customers, both taken over the trailing 365-day window. Orders are
-- de-duplicated first so the windowed arrays stay as small as possible.
SELECT DISTINCT
  DATE,
  SAFE_DIVIDE(
    (
      SELECT COUNT(DISTINCT store_customer_id)
      FROM UNNEST(last_year_customer_id_that_ordered_from_a_particular_store) AS store_customer_id
    ),
    (
      SELECT COUNT(DISTINCT any_customer_id)
      FROM UNNEST(last_year_customer_id_that_ordered) AS any_customer_id
    )
  ) AS number_i_want_to_calculate
FROM (
  SELECT
    DATE,
    ARRAY_AGG(customer_id) OVER trailing_year
      AS last_year_customer_id_that_ordered,
    -- NULL for orders that are not from the particular store.
    ARRAY_AGG(CASE WHEN is_from_particular_store THEN customer_id END) OVER trailing_year
      AS last_year_customer_id_that_ordered_from_a_particular_store
  FROM (
    -- One row per (date, customer, store flag).
    SELECT DISTINCT
      DATE,
      customer_id,
      is_from_particular_store
    FROM my_table_of_orders
  )
  WINDOW trailing_year AS (
    ORDER BY UNIX_SECONDS(TIMESTAMP(DATE))
    -- 31,536,000 seconds = 365 days
    RANGE BETWEEN 31536000 PRECEDING AND CURRENT ROW
  )
)
来源:https://stackoverflow.com/questions/62582377/is-there-a-way-to-use-order-by-clause-in-count-aggregate-analytic-function-if-n