Group data by the change of grouping column value in order

后端 未结 5 881
一个人的身影
一个人的身影 2020-12-14 03:57

With the following data

create table #ph (product int, [date] date, price int)
insert into #ph select 1, \'20120101\', 1
insert into #ph select 1, \'20120102         


        
相关标签:
5条回答
  • 2020-12-14 04:01

    I'm new to this forum so hope my contribution is helpful.

    If you really don't want to use a CTE (although I think thats probably the best approach) you can get a solution using set based code. You will need to test the performance of this code!.

    I have added in an extra temp table so that I can use a unique identifier for each record but I suspect you will already have this column in you source table. So heres the temp table.

        If Exists (SELECT Name FROM tempdb.sys.tables WHERE name LIKE '#phwithId%')
            DROP TABLE #phwithId    
    
        CREATE TABLE #phwithId
        (
            SaleId INT
            , ProductID INT
            , Price Money
            , SaleDate Date 
        )
        INSERT INTO #phwithId SELECT row_number() over(partition by product order by [date] asc) as SalesId, Product, Price, Date FROM ph 
    

    Now the main body of the Select statement

        SELECT 
            productId 
            , date_from
            , date_to
            , Price
        FROM
            (   
                SELECT 
                    dfr.ProductId
                    , ROW_NUMBER() OVER (PARTITION BY ProductId ORDER BY ChangeDate) AS rowno1          
                    , ChangeDate AS date_from
                    , dfr.Price
                FROM
                    (       
                        SELECT
                            sl1.ProductId AS ProductId
                            , sl1.SaleDate AS ChangeDate
                            , sl1.price
                        FROM
                            #phwithId sl1
                        LEFT JOIN
                            #phwithId sl2
                            ON sl1.SaleId = sl2.SaleId + 1
                        WHERE
                            sl1.Price <> sl2.Price OR sl2.Price IS NULL
                    ) dfr
            ) da1
        LEFT JOIN
            (   
                SELECT 
                    ROW_NUMBER() OVER (PARTITION BY ProductId ORDER BY ChangeDate) AS rowno2
                    , ChangeDate AS date_to     
                FROM
                    (   
                        SELECT 
                            sl1.ProductId
                            , sl1.SaleDate AS ChangeDate
                        FROM
                            #phwithId sl1
                        LEFT JOIN
                            #phwithId sl3
                            ON sl1.SaleId = sl3.SaleId - 1  
                        WHERE
                            sl1.Price <> sl3.Price OR sl3.Price IS NULL         
                    ) dto
    
            ) da2 
            ON da1.rowno1 = da2.rowno2  
    

    By binding the data source offset by 1 record (+or-) we can identify when the price buckets change and then its just a matter of getting the start and end dates for the buckets back into a single record.

    All a bit fiddly and I'm not sure its going to give better performance but I enjoyed the challenge.

    0 讨论(0)
  • 2020-12-14 04:07

    There's a (more or less) known technique of solving this kind of problem, involving two ROW_NUMBER() calls, like this:

    WITH marked AS (
      SELECT
        *,
        grp = ROW_NUMBER() OVER (PARTITION BY product        ORDER BY date)
            - ROW_NUMBER() OVER (PARTITION BY product, price ORDER BY date)
      FROM #ph
    )
    SELECT
      product,
      date_from = MIN(date),
      date_to   = MAX(date),
      price
    FROM marked
    GROUP BY
      product,
      price,
      grp
    ORDER BY
      product,
      MIN(date)
    

    Output:

    product  date_from   date_to        price 
    -------  ----------  -------------  ----- 
    1        2012-01-01  2012-01-04     1     
    1        2012-01-05  2012-01-08     2     
    1        2012-01-09  2012-01-12     1     
    
    0 讨论(0)
  • 2020-12-14 04:07
    WITH marked AS (
      SELECT
        *,
      case
       when (lag(price,1,'') over (partition by product order by date_from)) = price
       then 0 else 1
      end is_price_change
      FROM #ph
    ),
    marked_as_group AS
    ( SELECT m.*,
           SUM(is_price_change) over (PARTITION BY product order by date_from ROWS 
          BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS price_change_group
      FROM marked m
    ),
    SELECT
      product,
      date_from = MIN(date_from),
      date_to   = MAX(date_to),
      price = MIN(price)
    FROM marked_as_group 
    GROUP BY
      product,
      price_change_group
    ORDER BY
      product,
      date_to
    
    0 讨论(0)
  • 2020-12-14 04:15

    One solution I have come up with which is relatively "clean" is:

    ;with cte_sort (product, [date], price, [row])
    as
        (select product, [date], price, row_number() over(partition by product order by [date] asc) as row
         from #ph)
    
    select a.product, a.[date] as date_from, c.[date] as date_to, a.price 
    from cte_sort a
    left outer join cte_sort b on a.product = b.product and (a.row+1) = b.row and a.price = b.price
    outer apply (select top 1 [date] from cte_sort z where z.product = a.product and z.row > a.row order by z.row) c
    where b.row is null
    order by a.[date] 
    

    I used a CTE with row_number because you then don't need to worry about whether any dates are missing if you use functions like dateadd. You obviously only need the outer apply if you want to have the date_to column (which I do).

    This solution does solve my problem, I am however having a slight issue getting it to perform as quickly as I'd like on my table of 5 million rows.

    0 讨论(0)
  • 2020-12-14 04:24
    Create function [dbo].[AF_TableColumns](@table_name nvarchar(55))
    returns nvarchar(4000) as
    begin
    declare @str nvarchar(4000)
        select @str = cast(rtrim(ltrim(column_name)) as nvarchar(500)) + coalesce('         ' + @str , '            ') 
        from information_schema.columns
        where table_name = @table_name
        group by table_name, column_name, ordinal_position 
        order by ordinal_position DESC
    return @str
    end
    
    --select dbo.AF_TableColumns('YourTable') Select * from YourTable
    
    0 讨论(0)
提交回复
热议问题