Cycle through webpages and copy data

前端 未结 2 1439
我在风中等你
我在风中等你 2021-01-20 00:26

I created this script for a friend that cycles through a real estate website and snags email address for her (for promotion). The site offers them freely, but it\'s inconve

2条回答
  •  情歌与酒
    2021-01-20 01:03

    Here is true jedi approach :) uses only XMLHttpRequests, there aren't IE disadvantages or dependencies from it. Output window created dynamically via mshta without temp files. Processing speed can be improved by implementing async requests or multiprocess environment. The only way to stop the script at the moment unfortunately is wscript.exe process termination.

    Option Explicit
    
    Dim oDisplay, sUrl, sRespHeaders, sRespText, arrSetHeaders, sEventTarget, arrFormData, lPage, lMember, i, arrFormStrings, sFormData, arrMembers, arrMemeber, sUrlEmail, sRespTextEmail, sEmail
    
    Set oDisplay = New OutputWindow
    sUrl = "https://netforum.avectra.com/eweb/DynamicPage.aspx?Site=NEFAR&WebCode=IndResult&FromSearchControl=Yes"
    lPage = 0
    lMember = 0
    
    ' Initial webpage request
    oDisplay.Write("Connecting " & vbCrLf & sUrl)
    XmlHttpRequest "GET", sUrl, Array(), "", sRespHeaders, sRespText
    
    ' Loop through all pages
    Do
        ' Get cookies, form data, listctrl
        oDisplay.Write("Processing page #" & (lPage + 1))
        sEventTarget = ParseFragm("__doPostBack\('(ListControl_[\s\S]*?)',", sRespText)
        ParseResponse "^Set-(Cookie): ([\S]*?=[\S]*?);[\s\S]*?$", sRespHeaders, arrSetHeaders
        ParseResponse "", sRespText, arrFormData
    
        ' Update form params
        For i = 0 To UBound(arrFormData)
            Select Case arrFormData(i)(0)
            Case "__POSTBACKCONTROL"
                arrFormData(i)(1) = "JumpToPage"
            Case "__EVENTTARGET"
                arrFormData(i)(1) = sEventTarget
            Case "__EVENTARGUMENT"
                arrFormData(i)(1) = CStr(lPage)
            End Select
        Next
    
        ' Jump to page #lPage
        arrFormStrings = Array()
        ReDim arrFormStrings(UBound(arrFormData))
        For i = 0 To UBound(arrFormData)
            arrFormStrings(i) = EncodeUriComponent(arrFormData(i)(0)) & "=" & EncodeUriComponent(arrFormData(i)(1))
        Next
        sFormData = Join(arrFormStrings, "&")
        PushItem arrSetHeaders, Array("Content-Type", "application/x-www-form-urlencoded")
        PushItem arrSetHeaders, Array("Content-Length", CStr(Len(sFormData)))
    
        ' New page POST request
        XmlHttpRequest "POST", sUrl, arrSetHeaders, sFormData, sRespHeaders, sRespText
    
        ' Parse members from new page
        ParseMembers sRespText, arrMembers
    
        ' Parse members emails, and output 
        For Each arrMemeber in arrMembers
            lMember = lMember + 1
            sUrlEmail = "https://netforum.avectra.com/eweb/DynamicPage.aspx?Site=NEFAR&WebCode=PrimaryContactInfo&ind_cst_key=" & arrMemeber(0)
            XmlHttpRequest "GET", sUrlEmail, Array(), "", "", sRespTextEmail
            sEmail = ParseFragm("""mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})""", sRespTextEmail)
            oDisplay.WriteTable(Array(CStr(lMember), sEmail, arrMemeber(0), arrMemeber(1)))
        Next
    
        lPage = lPage + 1
    Loop
    
    
    Sub ParseResponse(sPattern, sResponse, arrData)
        Dim oMatch
        arrData = Array()
        With CreateObject("VBScript.RegExp")
            .Global = True
            .MultiLine = True
            .Pattern = sPattern
            For Each oMatch In .Execute(sResponse)
                PushItem arrData, Array(oMatch.SubMatches(0), oMatch.SubMatches(1))
            Next
        End With
    End Sub
    
    Function ParseFragm(sPattern, sResponse)
        Dim oMatches
        With CreateObject("VBScript.RegExp")
            .Global = True
            .MultiLine = True
            .Pattern = sPattern
            Set oMatches = .Execute(sResponse)
            If oMatches.Count > 0 Then ParseFragm = oMatches(0).SubMatches(0)
        End With
    End Function
    
    Sub ParseMembers(sRespText, arrMembers)
        Dim oMatch
        arrMembers = Array()
        With CreateObject("VBScript.RegExp")
            .Global = True
            .MultiLine = True
            .Pattern = "([\s\S]*?<[\s\S]*?Key=([\s\S]*?)&[\s\S]*?)"
            For Each oMatch In .Execute(sRespText)
                PushItem arrMembers, Array(oMatch.SubMatches(1), GetInnerText(oMatch.SubMatches(0)))
            Next
        End With
    End Sub
    
    Sub PushItem(arrList, varItem)
        ReDim Preserve arrList(UBound(arrList) + 1)
        arrList(UBound(arrList)) = varItem
    End Sub
    
    Function EncodeUriComponent(sText)
        With CreateObject("htmlfile")
            .Write ("")
            EncodeUriComponent = .DocumentElement.Document.Script.EncodeUriComponent(sText)
        End With
    End Function
    
    Function GetInnerText(sText)
        With CreateObject("htmlfile")
            .Write ("" & sText & "")
            GetInnerText = .DocumentElement.Document.GetElementsByTagName("body")(0).InnerText
        End With
    End Function
    
    Sub XmlHttpRequest(sMethod, sUrl, arrSetHeaders, sFormData, sRespHeaders, sRespText)
        Dim arrHeader
        With CreateObject("Msxml2.ServerXMLHTTP.3.0")
            .SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
            .Open sMethod, sUrl, False
            For Each arrHeader In arrSetHeaders
                .SetRequestHeader arrHeader(0), arrHeader(1)
            Next
            .Send sFormData
            sRespHeaders = .GetAllResponseHeaders
            sRespText = .ResponseText
        End With
    End Sub
    
    Class OutputWindow
    
        Dim oWnd, oDoc, oOutDiv, oCursorDiv, oOutTBody, sSignature, lCols
    
        Private Sub Class_Initialize()
            sSignature = "OutputWindow"
            ProvideWindow()
        End Sub
    
        Private Sub ProvideWindow()
            ' source http://forum.script-coding.com/viewtopic.php?pid=75356#p75356
            Dim lWidth, lHeight
            GetWindow()
            If oWnd Is Nothing Then
                CreateWindow()
                With oWnd
                    With .Document
                        .GetElementsByTagName("head")(0).AppendChild .CreateElement("style")
                        .stylesheets(0).cssText = "body, td, #output {font-family: consolas, courier new; font-size: 9pt;} #cursor {margin: 3px;} body {background-color: buttonface;} #output {height: 100%; width: 100%; overflow: scroll; background: #FFF;} div.hline {height: 1px; width: 100%; background-color: #000; overflow: hidden;} table {width: 100%; TEXT-ALIGN: center; border-COLLAPSE: collapse; background: transparent; margin-top: 1px;} td {border: black 1px solid;}"
                        .Title = "Output Window"
                        .Body.InnerHtml = "
    " End With lWidth = CInt(.Screen.AvailWidth * 0.75) lHeight = CInt(.Screen.AvailHeight * 0.75) .ResizeTo .Screen.AvailWidth, .Screen.AvailHeight .ResizeTo lWidth + .Screen.AvailWidth - .Document.Body.OffsetWidth, lHeight + .Screen.AvailHeight - .Document.Body.OffsetHeight .MoveTo CInt((.Screen.AvailWidth - lWidth) / 2), CInt((.Screen.AvailHeight - lHeight) / 2) End With End If Set oDoc = oWnd.Document Set oOutDiv = oWnd.output Set oCursorDiv = oWnd.cursor lCols = -1 End Sub Private Sub GetWindow() Dim oShellWnd On Error Resume Next For Each oShellWnd In CreateObject("Shell.Application").Windows Set oWnd = oShellWnd.GetProperty(sSignature) If Err.Number = 0 Then Exit Sub Err.Clear Next Set oWnd = Nothing End Sub Private Sub CreateWindow() Dim oProc Do Set oProc = CreateObject("WScript.Shell").exec("mshta ""about:""") Do If oProc.Status > 0 Then Exit Do GetWindow() If Not (oWnd Is Nothing) Then Exit Sub Loop Loop End Sub Private Sub ChkDoc() On Error Resume Next If TypeName(oDoc) <> "HTMLDocument" Then ProvideWindow() End Sub Public Sub Write(sText) Dim oDiv ChkDoc() On Error Resume Next Set oDiv = oDoc.CreateElement("div") oDiv.InnerHtml = EscapeHtml(sText) & "
    " oOutDiv.AppendChild oDiv oOutDiv.AppendChild oCursorDiv oOutDiv.ScrollTop = oOutDiv.ScrollHeight lCols = -1 End Sub Public Sub WriteTable(arrCells) Dim sInner, oTable, oRow, oTr, oCell, n ChkDoc() On Error Resume Next If UBound(arrCells) <> lCols Then Set oTable = oDoc.CreateElement("table") oOutDiv.AppendChild oTable Set oOutTBody = oDoc.CreateElement("tbody") oTable.AppendChild oOutTBody lCols = UBound(arrCells) End If Set oTr = oDoc.CreateElement("tr") oOutTBody.AppendChild oTr For n = 0 To lCols Set oCell = oTr.InsertCell(n) oCell.InnerHtml = EscapeHtml(arrCells(n)) Next oOutDiv.AppendChild oCursorDiv oOutDiv.ScrollTop = oOutDiv.ScrollHeight End Sub Public Sub BreakTable() lCols = -1 End Sub Private Function EscapeHtml(sCnt) Dim n sCnt = Replace(sCnt, "&", "&") sCnt = Replace(sCnt, """", """) sCnt = Replace(sCnt, "<", "<") sCnt = Replace(sCnt, ">", ">") sCnt = Replace(sCnt, "'", "'") sCnt = Replace(sCnt, vbCrLf, "
    ") sCnt = Replace(sCnt, Chr(9), "    ") sCnt = Replace(sCnt, " ", "  ") sCnt = Replace(sCnt, "  ", "  ") For n = 0 To 31 sCnt = Replace(sCnt, Chr(n), "¶") Next EscapeHtml = sCnt End Function Private Sub Class_Terminate() ' oWnd.close End Sub End Class

提交回复
热议问题