sql >> データベース >  >> RDS >> Sqlserver

T-SQLを使用したファジーマッチング

    SQL Serverがファジーマッチングを実行するために提供するものは、かなり不格好であることがわかりました。レーベンシュタイン距離アルゴリズムといくつかの重み付けを使用して、自分のCLR関数で本当に幸運に恵まれました。次に、そのアルゴリズムを使用して、GetSimilarityScoreというUDFを作成しました。このUDFは、2つの文字列を受け取り、0.0〜1.0のスコアを返します。 1.0に近いほど、一致は良好です。次に、しきい値が0.8以上のクエリを実行して、最も可能性の高い一致を取得します。このようなもの:

    if object_id('tempdb..#similar') is not null drop table #similar
    select a.id, (
        select top 1 x.id
       from MyTable x
       where x.id <> a.id
       order by dbo.GetSimilarityScore(a.MyField, x.MyField) desc
    ) as MostSimilarId
    into #similar
    from MyTable a
    
    select *, dbo.GetSimilarityScore(a.MyField, c.MyField)
    from MyTable a
    join #similar b on a.id = b.id
    join MyTable c on b.MostSimilarId = c.id
    

    本当に大きなテーブルではそれをしないでください。遅いプロセスです。

    CLR UDFは次のとおりです:

    ''' <summary>
    ''' Compute the distance between two strings.
    ''' </summary>
    ''' <param name="s1">The first of the two strings.</param>
    ''' <param name="s2">The second of the two strings.</param>
    ''' <returns>The Levenshtein cost.</returns>
    <Microsoft.SqlServer.Server.SqlFunction()> _
    Public Shared Function ComputeLevenstheinDistance(ByVal string1 As SqlString, ByVal string2 As SqlString) As SqlInt32
        If string1.IsNull OrElse string2.IsNull Then Return SqlInt32.Null
        Dim s1 As String = string1.Value
        Dim s2 As String = string2.Value
    
        Dim n As Integer = s1.Length
        Dim m As Integer = s2.Length
        Dim d As Integer(,) = New Integer(n, m) {}
    
        ' Step 1
        If n = 0 Then Return m
        If m = 0 Then Return n
    
        ' Step 2
        For i As Integer = 0 To n
            d(i, 0) = i
        Next
    
        For j As Integer = 0 To m
            d(0, j) = j
        Next
    
        ' Step 3
        For i As Integer = 1 To n
            'Step 4
            For j As Integer = 1 To m
                ' Step 5
                Dim cost As Integer = If((s2(j - 1) = s1(i - 1)), 0, 1)
    
                ' Step 6
                d(i, j) = Math.Min(Math.Min(d(i - 1, j) + 1, d(i, j - 1) + 1), d(i - 1, j - 1) + cost)
            Next
        Next
        ' Step 7
        Return d(n, m)
    End Function
    
    ''' <summary>
    ''' Returns a score between 0.0-1.0 indicating how closely two strings match.  1.0 is a 100%
    ''' T-SQL equality match, and the score goes down from there towards 0.0 for less similar strings.
    ''' </summary>
    <Microsoft.SqlServer.Server.SqlFunction()> _
    Public Shared Function GetSimilarityScore(string1 As SqlString, string2 As SqlString) As SqlDouble
        If string1.IsNull OrElse string2.IsNull Then Return SqlInt32.Null
    
        Dim s1 As String = string1.Value.ToUpper().TrimEnd(" "c)
        Dim s2 As String = string2.Value.ToUpper().TrimEnd(" "c)
        If s1 = s2 Then Return 1.0F ' At this point, T-SQL would consider them the same, so I will too
    
        Dim flatLevScore As Double = InternalGetSimilarityScore(s1, s2)
    
        Dim letterS1 As String = GetLetterSimilarityString(s1)
        Dim letterS2 As String = GetLetterSimilarityString(s2)
        Dim letterScore As Double = InternalGetSimilarityScore(letterS1, letterS2)
    
        'Dim wordS1 As String = GetWordSimilarityString(s1)
        'Dim wordS2 As String = GetWordSimilarityString(s2)
        'Dim wordScore As Double = InternalGetSimilarityScore(wordS1, wordS2)
    
        If flatLevScore = 1.0F AndAlso letterScore = 1.0F Then Return 1.0F
        If flatLevScore = 0.0F AndAlso letterScore = 0.0F Then Return 0.0F
    
        ' Return weighted result
        Return (flatLevScore * 0.2F) + (letterScore * 0.8F)
    End Function
    
    Private Shared Function InternalGetSimilarityScore(s1 As String, s2 As String) As Double
        Dim dist As SqlInt32 = ComputeLevenstheinDistance(s1, s2)
        Dim maxLen As Integer = If(s1.Length > s2.Length, s1.Length, s2.Length)
        If maxLen = 0 Then Return 1.0F
        Return 1.0F - Convert.ToDouble(dist.Value) / Convert.ToDouble(maxLen)
    End Function
    
    ''' <summary>
    ''' Sorts all the alpha numeric characters in the string in alphabetical order
    ''' and removes everything else.
    ''' </summary>
    Private Shared Function GetLetterSimilarityString(s1 As String) As String
        Dim allChars = If(s1, "").ToUpper().ToCharArray()
        Array.Sort(allChars)
        Dim result As New StringBuilder()
        For Each ch As Char In allChars
            If Char.IsLetterOrDigit(ch) Then
                result.Append(ch)
            End If
        Next
        Return result.ToString()
    End Function
    
    ''' <summary>
    ''' Removes all non-alpha numeric characters and then sorts
    ''' the words in alphabetical order.
    ''' </summary>
    Private Shared Function GetWordSimilarityString(s1 As String) As String
        Dim words As New List(Of String)()
        Dim curWord As StringBuilder = Nothing
        For Each ch As Char In If(s1, "").ToUpper()
            If Char.IsLetterOrDigit(ch) Then
                If curWord Is Nothing Then
                    curWord = New StringBuilder()
                End If
                curWord.Append(ch)
            Else
                If curWord IsNot Nothing Then
                    words.Add(curWord.ToString())
                    curWord = Nothing
                End If
            End If
        Next
        If curWord IsNot Nothing Then
            words.Add(curWord.ToString())
        End If
    
        words.Sort(StringComparer.OrdinalIgnoreCase)
        Return String.Join(" ", words.ToArray())
    End Function
    


    1. エンベロープを取得します。つまり、重複する期間

    2. カスタム型配列をPostgres関数に渡す方法

    3. Oracle SQLで特定の文字までの部分文字列を選択するにはどうすればよいですか?

    4. テーブル式の基礎、パート6 –再帰的CTE