Scraper throws errors instead of quitting the browser when everything is done

若如初见. 提交于 2019-11-27 09:27:00

Ok, so there is something seriously unfriendly about that webpage. It kept crashing for me. So I have resorted to running a javascript program within scripting engine/scripting control and it works.

I hope you can follow it. The logic is in the javascript added to the ScriptEngine. I get two lists of nodes, one list of films and one list of years; then I step through each array in sync and add them as key value pair to a Microsoft Scripting Dictionary.

Option Explicit

'*Tools->References
'*    Microsoft Scripting Runtime
'*    Microsoft Scripting Control
'*    Microsoft Internet Controls
'*    Microsoft HTML Object Library

Sub Torrent_Data()
    Dim row As Long
    Dim IE As New InternetExplorer, html As HTMLDocument
    Dim post As Object

    With IE
        .Visible = True
        .navigate "https://yts.am/browse-movies"
        Do While .readyState <> READYSTATE_COMPLETE:
            DoEvents
        Loop
        Set html = .document
    End With

    Dim dicFilms As Scripting.Dictionary
    Set dicFilms = New Scripting.Dictionary

    Call GetScriptEngine.Run("getMovies", html, dicFilms)

    Dim vFilms As Variant
    vFilms = dicFilms.Keys

    Dim vYears As Variant
    vYears = dicFilms.Items

    Dim lRowLoop As Long
    For lRowLoop = 0 To dicFilms.Count - 1

        Cells(lRowLoop + 1, 1) = vFilms(lRowLoop)
        Cells(lRowLoop + 1, 2) = vYears(lRowLoop)

    Next lRowLoop

    Stop

    IE.Quit
End Sub

Private Function GetScriptEngine() As ScriptControl
    '* see code from this SO Q & A
    ' https://stackoverflow.com/questions/37711073/in-excel-vba-on-windows-how-to-get-stringified-json-respresentation-instead-of
    Static soScriptEngine As ScriptControl
    If soScriptEngine Is Nothing Then
        Set soScriptEngine = New ScriptControl
        soScriptEngine.Language = "JScript"

        soScriptEngine.AddCode "function getMovies(htmlDocument, microsoftDict) { " & _
                                    "var titles = htmlDocument.querySelectorAll('a.browse-movie-title'), i;" & _
                                    "var years = htmlDocument.querySelectorAll('div.browse-movie-year'), j;" & _
                                    "if ( years.length === years.length) {" & _
                                    "for (i=0; i< years.length; ++i) {" & _
                                    "   var film = titles[i].innerText;" & _
                                    "   var year = years[i].innerText;" & _
                                    "   microsoftDict.Add(film, year);" & _
                                    "}}}"

    End If
    Set GetScriptEngine = soScriptEngine
End Function

The website has an API. Check e. g. result from the URL https://yts.am/api/v2/list_movies.json?page=1&limit=50, which actually represents 50 movies from first page of latest movies category, in JSON format.

Take a look at the below example. Import JSON.bas module into the VBA project for JSON processing.

Option Explicit

Sub Test()

    Dim sJSONString As String
    Dim vJSON
    Dim sState As String
    Dim lPage As Long
    Dim aRes()
    Dim i As Long
    Dim aData()
    Dim aHeader()

    With Sheets(1)
        .Cells.Delete
        .Cells.WrapText = False
    End With
    lPage = 1
    aRes = Array()
    Do
        With CreateObject("MSXML2.XMLHTTP")
            .Open "GET", "https://yts.am/api/v2/list_movies.json?page=" & lPage & "&limit=50", False
            .send
            sJSONString = .responseText
        End With
        JSON.Parse sJSONString, vJSON, sState
        If Not vJSON("data").Exists("movies") Then Exit Do
        vJSON = vJSON("data")("movies")
        ReDim Preserve aRes(UBound(aRes) + UBound(vJSON) + 1)
        For i = 0 To UBound(vJSON)
            Set aRes(UBound(aRes) - UBound(vJSON) + i) = vJSON(i)
        Next
        lPage = lPage + 1
        Debug.Print "Parsed " & (UBound(aRes) + 1)
        DoEvents
    Loop
    JSON.ToArray aRes, aData, aHeader
    With Sheets(1)
        OutputArray .Cells(1, 1), aHeader
        Output2DArray .Cells(2, 1), aData
        .Columns.AutoFit
    End With
    MsgBox "Completed"

End Sub

Sub OutputArray(oDstRng As Range, aCells As Variant)

    With oDstRng
        .Parent.Select
        With .Resize(1, UBound(aCells) - LBound(aCells) + 1)
            .NumberFormat = "@"
            .Value = aCells
        End With
    End With

End Sub

Sub Output2DArray(oDstRng As Range, aCells As Variant)

    With oDstRng
        .Parent.Select
        With .Resize( _
                UBound(aCells, 1) - LBound(aCells, 1) + 1, _
                UBound(aCells, 2) - LBound(aCells, 2) + 1)
            .NumberFormat = "@"
            .Value = aCells
        End With
    End With

End Sub

The output for me as follows, at the moment there are 7182 movies total:

BTW, the similar approach applied in the following answers: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 and 15.

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!