Highlight (not delete) repeat sentences or phrases

前端 未结 3 844
故里飘歌
故里飘歌 2020-12-28 10:03

I am getting the impression that this is not possible in word but I figure if you are looking for any 3-4 words that come in the same sequence anywhere in a very long paper

相关标签:
3条回答
  • 2020-12-28 10:46

    I did not use my own DAWG suggestion, and I am still interested in seeing if someone else has a way to do this, but I was able to come up with this:

    Option Explicit
    
    Sub test()
    Dim ABC As Scripting.Dictionary
    Dim v As Range
    Dim n As Integer
        n = 5
        Set ABC = FindRepeatingWordChains(n, ActiveDocument)
        ' This is a dictionary of word ranges (not the same as an Excel range) that contains the listing of each word chain/phrase of length n (5 from the above example).
        ' Loop through this collection to make your selections/highlights/whatever you want to do.
        If Not ABC Is Nothing Then
            For Each v In ABC
                v.Font.Color = wdColorRed
            Next v
        End If
    End Sub
    
    ' This is where the real code begins.
    Function FindRepeatingWordChains(ChainLenth As Integer, DocToCheck As Document) As Scripting.Dictionary
    Dim DictWords As New Scripting.Dictionary, DictMatches As New Scripting.Dictionary
    Dim sChain As String
    Dim CurWord As Range
    Dim MatchCount As Integer
    Dim i As Integer
    
        MatchCount = 0
    
        For Each CurWord In DocToCheck.Words
            ' Make sure there are enough remaining words in our document to handle a chain of the length specified.
            If Not CurWord.Next(wdWord, ChainLenth - 1) Is Nothing Then
                ' Check for non-printing characters in the first/last word of the chain.
                ' This code will read a vbCr, etc. as a word, which is probably not desired.
                ' However, this check does not exclude these 'words' inside the chain, but it can be modified.
                If CurWord <> vbCr And CurWord <> vbNewLine And CurWord <> vbCrLf And CurWord <> vbLf And CurWord <> vbTab And _
                    CurWord.Next(wdWord, ChainLenth - 1) <> vbCr And CurWord.Next(wdWord, ChainLenth - 1) <> vbNewLine And _
                    CurWord.Next(wdWord, ChainLenth - 1) <> vbCrLf And CurWord.Next(wdWord, ChainLenth - 1) <> vbLf And _
                    CurWord.Next(wdWord, ChainLenth - 1) <> vbTab Then
                    sChain = CurWord
                    For i = 1 To ChainLenth - 1
                        ' Add each word from the current word through the next ChainLength # of words to a temporary string.
                        sChain = sChain & " " & CurWord.Next(wdWord, i)
                    Next i
    
                    ' If we already have our temporary string stored in the dictionary, then we have a match, assign the word range to the returned dictionary.
                    ' If not, then add it to the dictionary and increment our index.
                    If DictWords.Exists(sChain) Then
                        MatchCount = MatchCount + 1
                        DictMatches.Add DocToCheck.Range(CurWord.Start, CurWord.Next(wdWord, ChainLenth - 1).End), MatchCount
                    Else
                        DictWords.Add sChain, sChain
                    End If
                End If
            End If
        Next CurWord
    
        ' If we found any matching results, then return that list, otherwise return nothing (to be caught by the calling function).
        If DictMatches.Count > 0 Then
            Set FindRepeatingWordChains = DictMatches
        Else
            Set FindRepeatingWordChains = Nothing
        End If
    
    End Function
    

    I have tested this on a 258 page document (TheStory.txt) from this source, and it ran in just a few minutes.

    See the test() sub for usage.

    You will need to reference the Microsoft Scripting Runtime to use the Scripting.Dictionary objects. If that is undesirable, small modifications can be made to use Collections instead, but I prefer the Dictionary as it has the useful .Exists() method.

    0 讨论(0)
  • 2020-12-28 10:50

    To highlight all duplicate sentences, you can also use ActiveDocument.Sentences(i). Here is an example

    LOGIC

    1) Get all the sentences from the word document in an array

    2) Sort the array

    3) Extract Duplicates

    4) Highlight duplicates

    CODE

    Option Explicit
    
    Sub Sample()
        Dim MyArray() As String
        Dim n As Long, i As Long
        Dim Col As New Collection
        Dim itm
    
        n = 0
        '~~> Get all the sentences from the word document in an array
        For i = 1 To ActiveDocument.Sentences.Count
            n = n + 1
            ReDim Preserve MyArray(n)
            MyArray(n) = Trim(ActiveDocument.Sentences(i).Text)
        Next
    
        '~~> Sort the array
        SortArray MyArray, 0, UBound(MyArray)
    
        '~~> Extract Duplicates
        For i = 1 To UBound(MyArray)
            If i = UBound(MyArray) Then Exit For
            If InStr(1, MyArray(i + 1), MyArray(i), vbTextCompare) Then
                On Error Resume Next
                Col.Add MyArray(i), """" & MyArray(i) & """"
                On Error GoTo 0
            End If
        Next i
    
        '~~> Highlight duplicates
        For Each itm In Col
            Selection.Find.ClearFormatting
            Selection.HomeKey wdStory, wdMove
            Selection.Find.Execute itm
            Do Until Selection.Find.Found = False
                Selection.Range.HighlightColorIndex = wdPink
                Selection.Find.Execute
            Loop
        Next
    End Sub
    
    '~~> Sort the array
    Public Sub SortArray(vArray As Variant, i As Long, j As Long)
      Dim tmp As Variant, tmpSwap As Variant
      Dim ii As Long, jj As Long
    
      ii = i: jj = j: tmp = vArray((i + j) \ 2)
    
      While (ii <= jj)
         While (vArray(ii) < tmp And ii < j)
            ii = ii + 1
         Wend
         While (tmp < vArray(jj) And jj > i)
            jj = jj - 1
         Wend
         If (ii <= jj) Then
            tmpSwap = vArray(ii)
            vArray(ii) = vArray(jj): vArray(jj) = tmpSwap
            ii = ii + 1: jj = jj - 1
         End If
      Wend
      If (i < jj) Then SortArray vArray, i, jj
      If (ii < j) Then SortArray vArray, ii, j
    End Sub
    

    SNAPSHOTS

    BEFORE

    enter image description here

    AFTER

    enter image description here

    0 讨论(0)
  • 2020-12-28 10:57

    I chose a rather lame theory, but it seems to work (at least if I got the question right cuz sometimes I'm a slow understander). I load the entire text into a string, load the individual words into an array, loop through the array and concatenate the string, containing each time three consecutive words.
    Because the results are already included in 3 word groups, 4 word groups or more will automatically be recognized.

    Option Explicit
    
    Sub Find_Duplicates()
    
    On Error GoTo errHandler
    
    Dim pSingleLine                     As Paragraph
    Dim sLine                           As String
    Dim sFull_Text                      As String
    Dim vArray_Full_Text                As Variant
    
    Dim sSearch_3                       As String
    Dim lSize_Array                     As Long
    Dim lCnt                            As Long
    Dim lCnt_Occurence                  As Long
    
    
    'Create a string from the entire text
    For Each pSingleLine In ActiveDocument.Paragraphs
        sLine = pSingleLine.Range.Text
        sFull_Text = sFull_Text & sLine
    Next pSingleLine
    
    'Load the text into an array
    vArray_Full_Text = sFull_Text
    vArray_Full_Text = Split(sFull_Text, " ")
    lSize_Array = UBound(vArray_Full_Text)
    
    
    For lCnt = 1 To lSize_Array - 1
        lCnt_Occurence = 0
        sSearch_3 = Trim(fRemove_Punctuation(vArray_Full_Text(lCnt - 1) & _
                        " " & vArray_Full_Text(lCnt) & _
                        " " & vArray_Full_Text(lCnt + 1)))
    
        With Selection.Find
            .Text = sSearch_3
            .Forward = True
            .Replacement.Text = ""
            .Wrap = wdFindContinue
            .Format = False
            .MatchCase = False
    
            Do While .Execute
    
                lCnt_Occurence = lCnt_Occurence + 1
                If lCnt_Occurence > 1 Then
                    Selection.Range.Font.Color = vbRed
                End If
                Selection.MoveRight
            Loop
        End With
    
        Application.StatusBar = lCnt & "/" & lSize_Array
    Next lCnt
    
    errHandler:
    Stop
    
    End Sub
    
    Public Function fRemove_Punctuation(sString As String) As String
    
    Dim vArray(0 To 8)      As String
    Dim lCnt                As Long
    
    
    vArray(0) = "."
    vArray(1) = ","
    vArray(2) = ","
    vArray(3) = "?"
    vArray(4) = "!"
    vArray(5) = ";"
    vArray(6) = ":"
    vArray(7) = "("
    vArray(8) = ")"
    
    For lCnt = 0 To UBound(vArray)
        If Left(sString, 1) = vArray(lCnt) Then
            sString = Right(sString, Len(sString) - 1)
        ElseIf Right(sString, 1) = vArray(lCnt) Then
            sString = Left(sString, Len(sString) - 1)
        End If
    Next lCnt
    
    fRemove_Punctuation = sString
    
    End Function
    

    The code assumes a continuous text without bullet points.

    0 讨论(0)
提交回复
热议问题