Parse ATOM rss feed and remove html tags

问题

I am developing this code using PowerShell. I need to be able to remove the HTML tags from the output.

  # Download the RSS feed to a local file, then load that file as an XML document.
  Invoke-WebRequest -Uri 'https://psu.box.com/shared/static/jf36ohodxnw7oemghsau1t7qb0w4y708.rss' -OutFile C:\users\anr2809\Documents\alerts.txt
  [xml]$Content = Get-Content C:\users\anr2809\Documents\alerts.txt -Raw
  $Regex = '(?s)SE1046.*?Description := "(?<Description>.*?)"'

  # NOTE(review): $Content is an [xml] document at this point, not the raw file
  # text, so -match is applied to whatever string the document converts to.
  # Confirm whether this check is still wanted.
  If ($Content -match $Regex) {
      "Description is '$($Matches['Description'])'"
      # do something here with $Matches['Description']
  }
  Else {
      "No match."
  }

  $Feed = $Content.rss.channel

  # Emit one object per <item> of the channel.
  ForEach ($msg in $Feed.Item) {
      $ParseData = $msg.description
      ForEach ($Datum in $ParseData) {
          # -like without wildcard characters only matches the exact text.
          If ($Datum -like "Title") { [int]$Upvote = ($Datum).split(' ') | Select-Object -First 1 }
          If ($Datum -like "comments") { [int]$Downvote = ($Datum).split(' ') | Select-Object -First 1 }
      }

      [PSCustomObject]@{
          'LastUpdated' = [datetime]$msg.pubDate
          'Title'       = $msg.title
          'Category'    = $msg.category
          'Author'      = $msg.author
          'Link'        = $msg.link
          'UpVotes'     = $Upvote
          'DownVotes'   = $Downvote
          # $Validation, $Workaround and $FeedBackID are not assigned in the code shown.
          'Validations' = $Validation
          'WorkArounds' = $Workaround
          # Raw description text; it still contains the HTML markup of the feed entry.
          'Comments'    = $msg.description.InnerText
          'FeedbackID'  = $FeedBackID
      }
  }

These are the results; I would like to remove the HTML tags.

LastUpdated : 3/30/2020 9:45:52 AM
Title       : Enterprise Network Planned Outage
Category    : 
Author      : 
Link        : link
UpVotes     : 
DownVotes   : 
Validations : 
WorkArounds : 
Comments    : 
                    <p><strong>People and Locations Impacted:</strong><br />All    students, faculty, and staff at all State locations<br /><br />
FeedbackID  :

回答1:

You could replace <br/> with actual line breaks, then tag-strip the rest completely:

# Turn <br>, <br/> and <br /> into real line breaks and strip every remaining tag.
$commentsPlain = $msg.description.InnerText -replace '<br\s*/?>', [System.Environment]::NewLine -replace '<[^>]+>'
# Decode HTML entities (&amp;, &nbsp;, ...) only after the tags are gone, so an
# escaped "&lt;...&gt;" is not mistaken for a tag, then trim surrounding whitespace.
$commentsPlain = [System.Net.WebUtility]::HtmlDecode($commentsPlain).Trim()

[PSCustomObject]@{
    'LastUpdated' = [datetime]$msg.pubDate
    'Title'       = $msg.title
    'Category'    = $msg.category
    'Author'      = $msg.author
    'Link'        = $msg.link
    'UpVotes'     = $Upvote
    'DownVotes'   = $Downvote
    'Validations' = $Validation
    'WorkArounds' = $Workaround
    # Plain-text version of the description, with the markup removed.
    'Comments'    = $commentsPlain
    'FeedbackID'  = $FeedBackID
}

回答2:

You should be able to use the following script. It makes use of the HTMLFile COM object.

  # Download the RSS feed to a local file, then load that file as an XML document.
  # The URI and paths are placeholders; substitute the real feed URL and file path.
  Invoke-WebRequest -Uri 'https://*.rss' -OutFile C:\*.rss
  [xml]$Content = Get-Content C:\*.rss -Raw
  $Regex = '(?s)SE1046.*?Description := "(?<Description>.*?)"'

  # NOTE(review): $Content is an [xml] document at this point, not the raw file
  # text, so -match is applied to whatever string the document converts to.
  # Confirm whether this check is still wanted.
  If ($Content -match $Regex) {
      "Description is '$($Matches['Description'])'"
      # do something here with $Matches['Description']
  }
  Else {
      "No match."
  }

  $Feed = $Content.rss.channel

  # Emit one object per <item> of the channel.
  ForEach ($msg in $Feed.Item) {
      $ParseData = $msg.description
      ForEach ($Datum in $ParseData) {
          # -like without wildcard characters only matches the exact text.
          If ($Datum -like "Title") { [int]$Upvote = ($Datum).split(' ') | Select-Object -First 1 }
          If ($Datum -like "comments") { [int]$Downvote = ($Datum).split(' ') | Select-Object -First 1 }
      }

      # Write the description markup into an HTMLFile COM document so that its
      # elements can be queried instead of stripping tags by hand.
      $HTML = New-Object -ComObject "HTMLFile"
      $HTML.IHTMLDocument2_write($ParseData.InnerText)

      [PSCustomObject]@{
          'LastUpdated' = [datetime]$msg.pubDate
          'Title'       = $msg.title
          'Category'    = $msg.category
          'Author'      = $msg.author
          'Link'        = $msg.link
          'UpVotes'     = $Upvote
          'DownVotes'   = $Downvote
          # $Validation, $Workaround and $FeedBackID are not assigned in the code shown.
          'Validations' = $Validation
          'WorkArounds' = $Workaround
          # InnerText of every <p> element in the parsed description.
          'Comments'    = $HTML.all.tags("p") | % InnerText
          'FeedbackID'  = $FeedBackID
      }
  }

来源：https://stackoverflow.com/questions/61159262/parse-atom-rss-feed-and-remove-html-tags

标签

powershell

powershell-4.0