Scraping text from file within HTML tags

前端 未结 2 1632
长发绾君心
长发绾君心 2020-12-18 17:08

I have a file that I want to extract dates from. It's an HTML source file, so it's full of code and phrases I don't need. I need to extract every instance of a date that's

2条回答
  •  离开以前
    2020-12-18 17:33

    If you're using Excel VBA, set a reference (Tools - References) to the MSHTML library (entitled Microsoft HTML Object Library in the reference menu)

    Sub ScrapeDateAbbr()
        ' Reads a local HTML file into memory, loads it into an MSHTML document
        ' and prints the title attribute of every <abbr> element that carries a
        ' data-utime attribute.
        '
        ' Requires a project reference to the Microsoft HTML Object Library.
    
        Dim hDoc As MSHTML.HTMLDocument
        Dim hElem As MSHTML.HTMLGenericElement
        Dim sFile As String, lFile As Long
        Dim sHtml As String
    
        'read in the file
        lFile = FreeFile
        sFile = "C:/Users/dick/Documents/My Dropbox/Excel/Testabbr.html"
        Open sFile For Input As lFile
        sHtml = Input$(LOF(lFile), lFile)
        'release the file handle as soon as the contents are in memory;
        'without this the file number stays open after the Sub ends
        Close lFile
    
        'put into an htmldocument object
        Set hDoc = New MSHTML.HTMLDocument
        hDoc.body.innerHTML = sHtml
    
        'loop through abbr tags
        For Each hElem In hDoc.getElementsByTagName("abbr")
            'only those that have a data-utime attribute
            If Len(hElem.getAttribute("data-utime")) > 0 Then
                'get the title attribute
                Debug.Print hElem.getAttribute("title")
            End If
        Next hElem
    
    End Sub
    

    I assumed the file was local since you called it a source file. If you need to download it first, you'd need another reference to MSXML and this code:

    Sub ScrapeDateAbbrDownload()
        ' Fetches the HTML through an XMLHTTP GET request, loads the response
        ' text into an MSHTML document and prints the title attribute of every
        ' <abbr> element that carries a data-utime attribute.
        '
        ' Requires project references to the Microsoft HTML Object Library
        ' and to Microsoft XML (MSXML2).
    
        ' readyState value the wait loop below polls for
        Const REQUEST_DONE As Long = 4
    
        Dim request As MSXML2.XMLHTTP
        Dim page As MSHTML.HTMLDocument
        Dim abbrTag As MSHTML.HTMLGenericElement
        Dim utime As Variant
    
        ' issue the request
        Set request = New MSXML2.XMLHTTP
        request.Open "GET", "file:///C:/Users/dick/Documents/My%20Dropbox/Excel/Testabbr.html"
        request.send
    
        ' yield to the host at least once, then keep yielding until the
        ' request reports the ready state we are waiting for
        Do
            DoEvents
        Loop While request.readyState <> REQUEST_DONE
    
        ' parse the response by assigning it to a fresh document's body
        Set page = New MSHTML.HTMLDocument
        page.body.innerHTML = request.responseText
    
        ' visit every abbr element and print the title of those
        ' whose data-utime attribute is non-empty
        For Each abbrTag In page.getElementsByTagName("abbr")
            utime = abbrTag.getAttribute("data-utime")
            If Len(utime) > 0 Then
                Debug.Print abbrTag.getAttribute("title")
            End If
        Next abbrTag
    
    End Sub
    

提交回复
热议问题