Is it possible to compare rows for similar data in SQL Server?

和自甴很熟 提交于 2019-12-02 21:12:12

问题


Is it possible to compare rows for similar data in SQL Server? I have a company name column in a table where company names could be somewhat similar. Here is an example of 8 different values that represent the same 4 companies:

ANDORRA WOODS   
ANDORRA WOODS HEALTHCARE CENTER
ABC HEALTHCARE, JOB #31181
ABC HEALTHCARE, JOB #31251
ACTION SERVICE  SALES, A SUBSIDIARY OF SINGER EQUIPMENT
ACTION SERVICE  SALES, A SUBSIDIARY OF SINGER EQUIPMENT COMPANY
APEX SYSTEMS
APEX SYSTEMS, INC

The way I clean it right now is with Google Refine, where I can identify clusters of similar data values and merge each cluster into a single value. In this example I need only 4 names, not 8, so I need to replace the similar ones with a single name, since I will be assigning indexes to those names later on. Any help is greatly appreciated.


回答1:


SSIS/Data Tools has a Fuzzy Grouping transformation that is very helpful in situations like this. It doesn't actually group your data; rather, it gives you similarity scores that you can use to determine which items should be grouped together.

There are plenty of tutorials out there; here's one: The Fuzzy Grouping Transformation




回答2:


I have a couple of UDFs that I converted from some VB code some time ago, if you're interested. They take in two varchar() values and return an int between 0 and 100 (0 = not similar, 100 = same).

-- Description: Normalizes a string for similarity comparison by upper-casing
--   it and keeping only the characters 0-9 and A-Z. Everything else
--   (spaces, punctuation, symbols) is dropped.
-- Parameters:  @String - the text to normalize.
-- Returns:     the upper-cased, filtered text; '' when no character survives
--              or when @String is NULL (the scan loop never runs then).
CREATE FUNCTION [dbo].[SimReplaceSpecial] 
(
    @String varchar(max)
)
RETURNS varchar(max)
AS
BEGIN
    DECLARE @Upper varchar(max) = UPPER(@String)
    DECLARE @Length int = LEN(@Upper)   -- LEN skips trailing blanks, which would be dropped anyway
    DECLARE @Kept varchar(max) = ''
    DECLARE @Index int = 1
    DECLARE @Code int

    WHILE @Index <= @Length
    BEGIN
        SET @Code = ASCII(SUBSTRING(@Upper, @Index, 1))

        -- Codes 48-57 are the digits '0'-'9'; codes 65-90 are the letters 'A'-'Z'.
        IF (@Code BETWEEN 48 AND 57) OR (@Code BETWEEN 65 AND 90)
            SET @Kept = @Kept + CHAR(@Code)

        SET @Index += 1
    END

    RETURN @Kept
END

-- Description: DO NOT CALL DIRECTLY - Used by the Similar function.
--   Finds the longest common substring (two characters or more) of
--   @String1 and @String2 and adds its length to the running score, then
--   repeats the search on the text to the left and to the right of that
--   substring.
--
--   The right-hand remainders are processed by looping inside this call
--   rather than by a recursive call. SQL Server aborts a statement once
--   function nesting exceeds 32 levels, and the right-hand chain grows by
--   one step for every common fragment found, so long inputs with many short
--   shared fragments would otherwise fail. Only the left-hand portions still
--   recurse. The score is a plain sum, so the result is the same either way.
--
-- Parameters:
--   @String1, @String2 - the two strings to compare (Similar passes them
--                        already reduced to upper-case letters and digits).
--   @Score             - score accumulated so far; pass 0 for a fresh run.
-- Returns: @Score plus the combined length of the common substrings found.
CREATE FUNCTION [dbo].[SimFindCommon] 
(
    @String1 varchar(max),
    @String2 varchar(max),
    @Score int
)
RETURNS int
AS
BEGIN
    DECLARE @Longest int
    DECLARE @StartPos1 int
    DECLARE @StartPos2 int
    DECLARE @J int

    DECLARE @HoldStr varchar(max)
    DECLARE @TestStr varchar(max)
    DECLARE @LeftStr1 varchar(max)
    DECLARE @LeftStr2 varchar(max)
    DECLARE @MorePairs bit = 1

    -- Each pass handles one (@String1, @String2) pair; at the end of a pass
    -- the pair is replaced by its right-hand remainders.
    WHILE @MorePairs = 1
    BEGIN
        SET @Longest = 0
        SET @StartPos1 = 0
        SET @StartPos2 = 0

        -- Try every suffix of @String2 that is still long enough to beat the
        -- best match found so far.
        SET @HoldStr = @String2
        WHILE LEN(@HoldStr) > @Longest
        BEGIN
            SET @TestStr = @HoldStr

            -- Shrink the candidate from the right until it occurs in
            -- @String1. Single characters never count as a match, and a
            -- candidate no longer than the current best cannot change the
            -- outcome, so the scan stops there.
            WHILE LEN(@TestStr) > 1 AND LEN(@TestStr) > @Longest
            BEGIN
                SET @J = CHARINDEX(@TestStr, @String1)
                IF @J > 0
                BEGIN
                    -- New longest match: remember its length and where it
                    -- occurs in each string.
                    SET @Longest = LEN(@TestStr)
                    SET @StartPos1 = @J
                    SET @StartPos2 = CHARINDEX(@TestStr, @String2)
                    -- Shorter prefixes of this suffix cannot do better.
                    BREAK
                END
                ELSE
                    -- Not contained in @String1: drop the final character
                    -- and try again.
                    SET @TestStr = LEFT(@TestStr, LEN(@TestStr) - 1)
            END

            -- Drop the first character and repeat with the next suffix.
            SET @HoldStr = RIGHT(@HoldStr, LEN(@HoldStr) - 1)
        END

        -- Credit the length of the common substring (0 when none was found).
        SET @Score = @Score + @Longest

        -- Left-hand portions are only examined when the match starts after
        -- position 3 in both strings.
        IF @StartPos1 > 3 AND @StartPos2 > 3
        BEGIN
            SET @LeftStr1 = LEFT(@String1, @StartPos1 - 1)
            SET @LeftStr2 = LEFT(@String2, @StartPos2 - 1)
            IF RTRIM(LTRIM(@LeftStr1)) <> '' AND RTRIM(LTRIM(@LeftStr2)) <> ''
                SET @Score = dbo.SimFindCommon(@LeftStr1, @LeftStr2, @Score)
        END

        -- Continue with the right-hand remainders while a match was found
        -- and both remainders still hold text; otherwise the work is done.
        SET @MorePairs = 0
        IF @Longest > 0
        BEGIN
            SET @String1 = SUBSTRING(@String1, @StartPos1 + @Longest, LEN(@String1))
            SET @String2 = SUBSTRING(@String2, @StartPos2 + @Longest, LEN(@String2))

            IF RTRIM(LTRIM(@String1)) <> '' AND RTRIM(LTRIM(@String2)) <> ''
                SET @MorePairs = 1
        END
    END

    RETURN @Score
END

-- Description: Scores how alike two strings are, ignoring case and every
--   character that is not a letter or a digit.
-- Parameters:  @String1, @String2 - the strings to compare.
-- Returns:     an int from 0 (not at all similar) to 100 (identical once
--              normalized). Returns 0 when either input is NULL or empty,
--              or has no letters or digits left after normalization.
CREATE FUNCTION [dbo].[Similar] 
(
    @String1 varchar(max),
    @String2 varchar(max)
)
RETURNS int
AS
BEGIN
    DECLARE @Clean1 varchar(max)
    DECLARE @Clean2 varchar(max)
    DECLARE @Common int

    -- Nothing to compare when either input is missing or empty.
    IF ISNULL(@String1, '') = '' OR ISNULL(@String2, '') = ''
        RETURN 0

    -- Reduce both strings to upper-case letters and digits only.
    SET @Clean1 = dbo.SimReplaceSpecial(@String1)
    SET @Clean2 = dbo.SimReplaceSpecial(@String2)

    -- Normalization may have removed everything from one or both strings.
    IF RTRIM(LTRIM(@Clean1)) = '' OR RTRIM(LTRIM(@Clean2)) = ''
        RETURN 0

    -- Identical after normalization: no need to search for substrings.
    IF @Clean1 = @Clean2
        RETURN 100

    -- Combined length of the common substrings shared by the two strings.
    SET @Common = dbo.SimFindCommon(@Clean1, @Clean2, 0)

    -- Express the score as a percentage of the maximum possible score,
    -- which is the average length of the two normalized strings
    -- (integer arithmetic, so the result is truncated).
    RETURN @Common * 200 / (LEN(@Clean1) + LEN(@Clean2))
END



-- Usage ----------------------------------------------------------------------
-- Call dbo.Similar only; the helper functions are not meant to be invoked
-- directly. The score noted after each query is the result reported for
-- that pair of names.
SELECT dbo.Similar(
    'ANDORRA WOODS',
    'ANDORRA WOODS HEALTHCARE CENTER'
);
-- Result = 60

SELECT dbo.Similar(
    'ABC HEALTHCARE, JOB #31181',
    'ABC HEALTHCARE, JOB #31251'
);
-- Result = 85

SELECT dbo.Similar(
    'ACTION SERVICE  SALES, A SUBSIDIARY OF SINGER EQUIPMENT',
    'ACTION SERVICE  SALES, A SUBSIDIARY OF SINGER EQUIPMENT COMPANY'
);
-- Result = 92

SELECT dbo.Similar(
    'APEX SYSTEMS',
    'APEX SYSTEMS, INC'
);
-- Result = 88


来源:https://stackoverflow.com/questions/18836884/is-it-possible-to-compare-rows-for-similar-data-in-sql-server

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!