Excel VBA to Search for Text in PDF and Extract and Name Pages

后端 未结 3 1427
情深已故
情深已故 2020-12-17 07:23

I have the following code, which looks at each cell in column A of my spreadsheet, searches for the text it finds there in the specified PDF and then extracts the page where

相关标签:
3条回答
  • 2020-12-17 07:28
    Sub BatchRenameCS()
    
    Dim objApp As Object
    Dim objPDDoc As Object
    Dim objjso As Object
    Dim newPDF As Acrobat.CAcroPDDoc
    Dim lastrow2 As Long
    Dim strFileName As String
    Dim Folder As String
    Dim Page As Long
    Dim Cell As Long
    Dim PDFCharacterCount() As Long
    Dim CharacterCount As Long
    Dim i As Integer
    Dim c As Integer
    Dim x As Integer
    Dim strSource As String
    Dim strResult As String
    Dim PDFCharacters As String
    Dim PDFCharacters2 As String
    Dim PDFPasteData() As String
    Dim PasteDataPage As Integer
    Dim LastRow As Long
    Dim NewName As String
    Dim NewNamePageNum As Integer
    Dim Check()
    
    Sheets("Sheet1").Range("C:D").ClearContents
    
    strFileName = selectFile()
    Folder = GetFolder()
    
    'create array with pdf word count
    Set objApp = CreateObject("AcroExch.App")
    Set objPDDoc = CreateObject("AcroExch.PDDoc")
    'AD.1 open file, if =false file is damage
        If objPDDoc.Open(strFileName) Then
            Set objjso = objPDDoc.GetJSObject
    
    ReDim PDFCharacterCount(1 To objPDDoc.GetNumPages) As Long
    
    For Page = 1 To objPDDoc.GetNumPages
    PDFCharacters = ""
    PDFCharacters2 = ""
        For c = 0 To objjso.GetPageNumWords(Page - 1)
        PDFCharacters = PDFCharacters & objjso.getPageNthWord(Page - 1, c)
        Next c
        For i = 1 To Len(PDFCharacters)
            Select Case Asc(Mid(PDFCharacters, i, 1))
                Case 48 To 57, 65 To 90, 97 To 122:
                PDFCharacters2 = PDFCharacters2 & (Mid(PDFCharacters, i, 1))
                Case Else
                PDFCharacters2 = PDFCharacters2 & ""
            End Select
        Next
        PDFCharacterCount(Page) = Len(PDFCharacters2)
    
    Next Page
    
    lastrow2 = Sheets("Sheet2").Cells(Rows.Count, "A").End(xlUp).Row
    Page = 1
    ReDim PDFPasteData(1 To objPDDoc.GetNumPages) As String
    For Cell = 1 To lastrow2
        strResult = ""
        strSource = Sheets("Sheet2").Cells(Cell, 1).Text
        PDFPasteData(Page) = PDFPasteData(Page) & " " & strSource
        For i = 1 To Len(strSource)
            Select Case Asc(Mid(strSource, i, 1))
                Case 48 To 57, 65 To 90, 97 To 122:
                strResult = strResult & (Mid(strSource, i, 1))
                Case Else
                strResult = strResult & ""
            End Select
        Next
    
    CharacterCount = CharacterCount + Len(strResult)
    
    If CharacterCount = PDFCharacterCount(Page) Then
    CharacterCount = 0
    Page = Page + 1
    End If
    
    Next Cell
    ReDim Check(2, objPDDoc.GetNumPages)
    LastRow = Sheets("Sheet1").Cells(Rows.Count, "A").End(xlUp).Row
    For Each LookUpCell In Worksheets("Sheet1").Range("A2:A" & LastRow)
        For PasteDataPage = 1 To objPDDoc.GetNumPages
            If InStr(PDFPasteData(PasteDataPage), LookUpCell.Value) Then
            Check(1, PasteDataPage) = Check(1, PasteDataPage) + 1
            Check(2, PasteDataPage) = Check(2, PasteDataPage) & LookUpCell.Offset(0, 1).Value & Chr(10)
                                    If FileExist(Folder & "\" & LookUpCell.Offset(0, 1) & ".pdf") Then
    
                                            Set newPDF = CreateObject("AcroExch.pdDoc")
                                            NewName = Folder & "\" & LookUpCell.Offset(0, 1) & ".pdf"
                                            newPDF.Open (NewName)
                                            newPDF.InsertPages newPDF.GetNumPages - 1, objPDDoc, PasteDataPage - 1, 1, 0
                                            newPDF.Save 1, NewName
                                            newPDF.Close
                                            Set newPDF = Nothing
                                     Else
                                            Set newPDF = CreateObject("AcroExch.PDDoc")
                                            newPDF.Create
                                            NewName = Folder & "\" & LookUpCell.Offset(0, 1) & ".pdf"
                                            newPDF.InsertPages -1, objPDDoc, PasteDataPage - 1, 1, 0
                                            newPDF.Save 1, NewName
                                            newPDF.Close
                                            Set newPDF = Nothing
                                    End If
            End If
        Next PasteDataPage
    Next LookUpCell
    x = 1
    For PasteDataPage = 1 To objPDDoc.GetNumPages
        If Check(1, PasteDataPage) <> 1 Then
        Sheets("Sheet1").Cells(x, 3) = PasteDataPage
        Sheets("Sheet1").Cells(x, 4) = Check(2, PasteDataPage)
        x = x + 1
        End If
    Next PasteDataPage
    End If
    MsgBox "Done"
    End Sub
    Function FileExist(path As String) As Boolean
        If Dir(path) <> vbNullString Then FileExist = True
    End Function
    Function selectFile()
    Dim fd As FileDialog, fileName As String
    On Error GoTo ErrorHandler
    Set fd = Application.FileDialog(msoFileDialogFilePicker)
    fd.AllowMultiSelect = False
    If fd.Show = True Then
        If fd.SelectedItems(1) <> vbNullString Then
            fileName = fd.SelectedItems(1)
        End If
    Else
        'Exit code if no file is selected
        End
    End If
    'Return Selected FileName
    selectFile = fileName
    Set fd = Nothing
    Exit Function
    ErrorHandler:
    Set fd = Nothing
    MsgBox "Error " & Err & ": " & Error(Err)
    End Function
    Function GetFolder() As String
        Dim fldr As FileDialog
        Dim sItem As String
        Set fldr = Application.FileDialog(msoFileDialogFolderPicker)
        With fldr
            .Title = "Select the Folder where you want you new PDFs to go"
            .AllowMultiSelect = False
            .InitialFileName = Application.DefaultFilePath
            If .Show <> -1 Then GoTo NextCode
            sItem = .SelectedItems(1)
        End With
    NextCode:
        GetFolder = sItem
        Set fldr = Nothing
    
    End Function
    
    0 讨论(0)
  • 2020-12-17 07:37

    Loops are definitely excellent for some things, but can tie down processing with these higher queries. Recently, a colleague and I were doing a similar task (not pdf-related though), and we had much success with using a range.find method instead of a loop executing instr on each cell.

    Some points of interest: -To mimic the “loop cells” functionality when using the .find method, we ended our range statement with .cells, as seen below:

    activesheet.usedrange.cells.find( )

    Where the desired string goes within the ( ).

    -The return value: “A Range object that represents the first cell where that information is found.”

    Once the .find method returns a range, a subsequent subroutine can extract the page number and document name.

    -If you need to find the nth instance of an occurrence, “You can use the FindNext andFindPrevious methods to repeat the search.” (Microsoft)

    Microsoft overview of range.find: https://msdn.microsoft.com/en-us/vba/excel-vba/articles/range-find-method-excel

    So with this approach, the user can use a loop based on a count of cells in your list to execute the .find method for each string.

    Downside is (I assume) that this must be done on text within the excel application; also, I’ve not tested it to determine if the string has to inhabit the cell by itself (I don’t think this is a concern).

    ‘===================

    Another suggestion that might be beneficial is to first bulk-rip all text from the .pdf with as little looping as possible (direct actions at the document object level). Then your find/return approach can be applied to the bulk text.

    I did a similar activity when creating study notes from a professor’s PowerPoints; I grabbed all the text into a .txt file, then returned every sentence containing the instance of a list of strings.

    ‘=====================

    A few caveats: I admit that I have not executed parsing at the sheer size of your project, so my suggestions might not be advantageous in practice.

    Also, I have not done much work parsing .pdf documents, as I try to opt for anything that is .txt/excel app first, and engage it instead.

    Good luck in your endeavors; I hope I was able to at least provide food for thought!

    0 讨论(0)
  • 2020-12-17 07:39

    Sorry to post a quick, incomplete answer, but I think I can point you in a good direction.

    Instead of making the system look up the two terms hundreds of billions of times, then make hundreds of billions of comparisons, put your search terms into an array, and the text of each page into a long string.Then it only has to do one look up and 200 comparisons per page.

    'Dim your Clipboard functions
    Public Declare PtrSafe Function OpenClipboard Lib "user32" (ByVal hwnd As Long) As Long
    Public Declare PtrSafe Function EmptyClipboard Lib "user32" () As Long
    Public Declare PtrSafe Function CloseClipboard Lib "user32" () As Long
    
    '...
    
    Dim objData As New MSForms.DataObject
    Dim arrSearch() As String
    Dim strTxt As String
    
    '...
    
    'Create array of search terms
    For i = 2 To lastrow
        arrSearch(i - 2) = Sheets("Sheet1").Cells(1, i)
    Next i
    
    For page = 0 To objPDDoc.GetNumPages - 1
    
        '[Move each page into a new document. You already have that code]
    
        'Clear clipboard
        OpenClipboard (0&)
        EmptyClipboard
        CloseClipboard
    
        'Copy page to clipboard
        objApp.MenuItemExecute ("SelectAll")
        objApp.MenuItemExecute ("Copy")
        'You can also do this with the JavaScript object: objjso.ExecMenuItem("Item Name")
        'You may have to insert a waiting function like sleep() here to wait for the action to complete
    
        'Put data from clipboard into a string.
        objData.GetFromClipboard
        strTxt = objData.GetText 'Now you can search the entire content of the page at once, within memory
    
        'Compare each element of the array to the string
        For i = LBound(arrSearch) To UBound(arrSearch)
            If InStr(1, strTxt, arrSearch(i)) > 0 Then
                '[You found a match. Your code here]
            End If
        Next i
    
    Next page
    

    This is still cumbersome because you have to open each page in a new document. If there is a good way to determine which page you're on purely by text (such as the page number at the bottom of page a, followed immediately by the header at the top of page b) then you might look at copying the entire text of the document into one string, then using the clues from the text to decide which page to extract once you find a match. That would be a lot faster I believe.

    0 讨论(0)
提交回复
热议问题