Group All Related Records in Many to Many Relationship, SQL graph connected components

后端 未结 6 888
遥遥无期
遥遥无期 2020-12-08 21:48

Hopefully I'm missing a simple solution to this.

I have two tables. One contains a list of companies. The second contains a list of publishers. The mapping between

6条回答
  •  星月不相逢
    2020-12-08 22:32

    Bit late to the challenge, and since SQLFiddle seems to be down ATM I'll have to guess your data-structures. Nevertheless, it seemed like a fun challenge (and it was =) so here's what I made from it :

    Setup:

    -- Start from a clean slate so the script can be re-run.
    -- t_link is dropped first because it holds the foreign keys to the other two tables.
    IF OBJECT_ID('t_link')          IS NOT NULL DROP TABLE t_link
    IF OBJECT_ID('t_company')       IS NOT NULL DROP TABLE t_company
    IF OBJECT_ID('t_publisher')     IS NOT NULL DROP TABLE t_publisher
    IF OBJECT_ID('tempdb..#link_A') IS NOT NULL DROP TABLE #link_A
    IF OBJECT_ID('tempdb..#link_B') IS NOT NULL DROP TABLE #link_B
    GO
    
    -- Companies, keyed by a generated id.
    CREATE TABLE t_company
    (
        company_id   int IDENTITY(1, 1) NOT NULL PRIMARY KEY,
        company_name varchar(100)       NOT NULL
    )
    GO
    
    -- Publishers, keyed by a generated id.
    CREATE TABLE t_publisher
    (
        publisher_id   int IDENTITY(1, 1) NOT NULL PRIMARY KEY,
        publisher_name varchar(100)       NOT NULL
    )
    
    -- Many-to-many mapping between companies and publishers.
    -- group_id starts out NULL and is filled in later by the grouping step.
    CREATE TABLE t_link
    (
        company_id   int NOT NULL REFERENCES t_company (company_id),
        publisher_id int NOT NULL REFERENCES t_publisher (publisher_id),
        group_id     int NULL,
        PRIMARY KEY (company_id, publisher_id)
    )
    GO
    
    -- Example content.
    -- After grouping, the report is expected to look like this:
    --
    -- ROW   GROUPID     Company     Publisher
    --  1    1           A           Y
    --  2    1           A           X
    --  3    1           B           Y
    --  4    1           B           Z
    --  5    2           C           W
    --  6    2           C           P
    --  7    2           D           W
    
    INSERT INTO t_company (company_name)
    VALUES ('A'), ('B'), ('C'), ('D')
    
    INSERT INTO t_publisher (publisher_name)
    VALUES ('X'), ('Y'), ('Z'), ('W'), ('P')
    
    -- The links are listed by name and resolved to the generated ids
    -- by joining the pair list to both lookup tables.
    INSERT INTO t_link (company_id, publisher_id)
    SELECT c.company_id,
           p.publisher_id
      FROM (VALUES ('A', 'Y'),
                   ('A', 'X'),
                   ('B', 'Y'),
                   ('B', 'Z'),
                   ('C', 'W'),
                   ('C', 'P'),
                   ('D', 'W')
           ) AS pair (company_name, publisher_name)
     INNER JOIN t_company AS c
             ON c.company_name = pair.company_name
     INNER JOIN t_publisher AS p
             ON p.publisher_name = pair.publisher_name
    GO
    
    /*
    -- volume testing
    -- Disabled (commented-out) data generator: empties the three tables and
    -- refills them with random companies, publishers and links.
    
    TRUNCATE TABLE t_link
    DELETE t_company
    DELETE t_publisher
    
    
    -- Number of companies, publishers and links to generate.
    DECLARE @company_count   int = 1000,
            @publisher_count int = 450,
            @links_count     int = 800
    
    
    -- NOTE(review): master.dbo.fn_int_list is not defined in this script;
    -- presumably it returns one row per integer in the given range. Confirm
    -- that it exists (or supply it) before enabling this block.
    -- Names are random GUIDs converted to text.
    INSERT t_company (company_name)
    SELECT company_name    = Convert(varchar(100), NewID())
      FROM master.dbo.fn_int_list(1, @company_count) 
    
    UPDATE STATISTICS t_company
    
    INSERT t_publisher (publisher_name)
    SELECT publisher_name  = Convert(varchar(100), NewID())
      FROM master.dbo.fn_int_list(1, @publisher_count) 
    
    UPDATE STATISTICS t_publisher
    
    -- Random links between the companies & publishers
    -- Each pass takes a random 30 percent of the companies and of the
    -- publishers (ORDER BY NewID()), numbers both samples with row_id and
    -- pairs them up on row_id. Pairs already present in t_link are skipped,
    -- and INSERT TOP caps each pass at the number of links still missing.
    
    DECLARE @count int
    SELECT @count = 0
    
    WHILE @count < @links_count
        BEGIN
    
            -- NOTE(review): the "+ 0" presumably turns the id into an expression so
            -- that SELECT INTO does not carry over the source column's IDENTITY
            -- property next to the generated row_id. Confirm.
            SELECT TOP 30 PERCENT row_id = IDENTITY(int, 1, 1), company_id = company_id + 0
              INTO #link_A
              FROM t_company
             ORDER BY NewID()
    
            SELECT TOP 30 PERCENT row_id = IDENTITY(int, 1, 1), publisher_id = publisher_id + 0
              INTO #link_B
              FROM t_publisher
             ORDER BY NewID()
    
            INSERT TOP (@links_count - @count) t_link (company_id, publisher_id)
            SELECT A.company_id,
                   B.publisher_id
              FROM #link_A A
              JOIN #link_B B
                ON A.row_id = B.row_id
             WHERE NOT EXISTS ( SELECT *
                                  FROM t_link old
                                 WHERE old.company_id   = A.company_id
                                   AND old.publisher_id = B.publisher_id)
    
            -- Add the number of links actually inserted in this pass.
            SELECT @count = @count + @@ROWCOUNT
    
            -- The samples are rebuilt from scratch on the next pass.
            DROP TABLE #link_A
            DROP TABLE #link_B    
        END
    
    */
    

    Actual grouping:

    IF OBJECT_ID('tempdb..#links') IS NOT NULL DROP TABLE #links
    GO
    
    -- Grouping: find the connected components of the company/publisher graph.
    -- Every link starts in a group of its own. The smallest group id is then
    -- propagated repeatedly to every link that shares a company or a publisher,
    -- until a pass changes nothing. Finally the surviving group ids are
    -- renumbered to 1..N without gaps.
    
    -- Working copy of the links. group_id gets a placeholder here because the
    -- IDENTITY() value cannot be reused in the same SELECT ... INTO.
    SELECT row_id = IDENTITY(int, 1, 1),
           company_id,
           publisher_id,
           group_id = 0
      INTO #links
      FROM t_link
    
    -- No index obviously helps the OR predicate used below; the table is
    -- clustered on row_id to avoid a heap, which also supports the row_id
    -- join in the renumbering step.
    CREATE CLUSTERED INDEX idx0 ON #links (row_id)
    
    -- Initial state: one group per link.
    UPDATE #links
       SET group_id = row_id
    
    -- Propagate the minimum group id until a fixed point is reached.
    -- The row count is captured into @rows_changed right after the UPDATE, so
    -- extra statements (e.g. a debugging SELECT) can be added to the loop body
    -- without affecting termination.
    DECLARE @rows_changed int = 1
    
    WHILE @rows_changed > 0
        BEGIN
            UPDATE upd
               SET group_id = x.new_group_id
              FROM #links AS upd
             CROSS APPLY (SELECT new_group_id = MIN(peer.group_id)
                            FROM #links AS peer
                           WHERE peer.company_id   = upd.company_id
                              OR peer.publisher_id = upd.publisher_id
                         ) AS x
             WHERE upd.group_id > x.new_group_id
    
            SET @rows_changed = @@ROWCOUNT
        END
    
    -- Remove 'holes': DENSE_RANK over group_id maps the surviving ids to
    -- 1..N in a single pass (same result as counting the distinct group ids
    -- that are <= the current one, without a correlated subquery per row).
    UPDATE upd
       SET group_id = ranked.dense_group_id
      FROM #links AS upd
     INNER JOIN (SELECT row_id,
                        dense_group_id = DENSE_RANK() OVER (ORDER BY group_id)
                   FROM #links
                ) AS ranked
             ON ranked.row_id = upd.row_id
    
    GO
    
    -- Copy the computed group ids from the working table back into t_link.
    -- Because of the outer join, a link with no counterpart in #links ends up
    -- with a NULL group_id.
    UPDATE upd
       SET group_id = computed.group_id
      FROM t_link AS upd
      LEFT JOIN #links AS computed
             ON computed.company_id   = upd.company_id
            AND computed.publisher_id = upd.publisher_id
    
    GO
    -- Report: every link with its group, numbered and sorted by
    -- group, company name and publisher name.
    SELECT ROW_NUMBER() OVER (ORDER BY l.group_id, c.company_name, p.publisher_name) AS [row],
           l.group_id,
           c.company_name,
           p.publisher_name
      FROM t_link AS l
     INNER JOIN t_company AS c
             ON c.company_id = l.company_id
     INNER JOIN t_publisher AS p
             ON p.publisher_id = l.publisher_id
     ORDER BY [row]
    

    At first sight this approach hasn't been tried yet by anyone else; it's interesting to see how this can be done in a variety of ways... (I preferred not to read the other answers upfront, as that would spoil the puzzle =)

    The results look as expected (as far as I understand the requirements and the example), and performance isn't too shabby either. There is no real indication of the number of records this should work on, so I'm not sure how it would scale, but I don't expect too many problems either...

提交回复
热议问题