Replace relative urls to absolute

有些话、适合烂在心里 提交于 2019-12-04 18:16:55
L.B

Don't try to parse HTML with regex, as explained here: https://stackoverflow.com/a/1732454/932418 and https://stackoverflow.com/a/1758162/932418

Use an html parser like HtmlAgilityPack instead

// Sample document mixing root-relative and already-absolute URLs.
string html =
@"<html>
    <head>
            <link rel=""stylesheet"" type=""text/css"" href=""/css/all.css"" /> 
    </head>
    <body>
        <a href=""/test.aspx"">Test</a>
        <a href=""http://example.com"">Test</a>
        <img src=""/images/test.jpg""/>
        <img src=""http://example.com/images/test.jpg""/>
    </body>
</html>";

string baseUrl = "http://example.com";
// Parse the base once instead of once per rewritten attribute.
Uri baseUri = new Uri(baseUrl);

HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(html);

// Element name / URL-bearing attribute pairs to rewrite. <link> is included so
// the stylesheet reference in <head> is made absolute as well.
string[][] urlAttributes =
{
    new[] { "img", "src" },
    new[] { "a", "href" },
    new[] { "link", "href" },
};

foreach (string[] pair in urlAttributes)
{
    foreach (var node in doc.DocumentNode.Descendants(pair[0]))
    {
        // The indexer yields null when the element lacks the attribute
        // (e.g. <a name="top">), so guard before touching Value.
        var attribute = node.Attributes[pair[1]];
        if (attribute == null)
        {
            continue;
        }

        // TryCreate leaves malformed values untouched instead of throwing
        // UriFormatException; already-absolute values resolve to themselves.
        Uri absolute;
        if (Uri.TryCreate(baseUri, attribute.Value, out absolute))
        {
            attribute.Value = absolute.AbsoluteUri;
        }
    }
}

// Serialize the modified document back to a string.
string newHtml;
using (StringWriter writer = new StringWriter())
{
    doc.Save(writer);
    newHtml = writer.ToString();
}

Add

<base href="http://mysite.com/images/" />

To the head of the page

Use regular expressions for this. Here is a short example:

/// <summary>
/// Demonstrates rewriting relative <c>src</c>/<c>href</c> attribute values to
/// absolute URLs with a regular expression and prints the rewritten markup.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    string input = "<html>\n<head>\n<link rel=\"stylesheet\" type=\"text/css\" href=\"/css/all.css\" /> \n</head>\n<body>\n<a href=\"/test.aspx\">Test</a>\n<a href=\"http://mysite.com\">Test</a>\n<img src=\"/images/test.jpg\"/>\n<img src=\"http://mysite.com/images/test.jpg\"/>\n</body>\n</html>";

    // Group 1: attribute name. Group 2: the URL *without* its leading slash
    // (the optional character class before the lookahead consumes it).
    // The negative lookahead skips values that already carry a scheme.
    string pattern = "((?:src|href)[\\s]*?)(?:\\=[\\s]*?[\\\"\\\'])[\\/*\\\\*]?(?!..+[s]?\\:[\\/]*)(.*?)(?:[\\s\\\"\\\'])";
    var reg = new Regex(pattern, RegexOptions.IgnoreCase);
    string prefix = @"http://mysite.com";

    // Because group 2 excludes the leading slash, the separator has to be put
    // back explicitly; otherwise the output is "http://mysite.comcss/all.css".
    // TrimEnd keeps a prefix that already ends in '/' from doubling it.
    string replacement = "$1=\"" + prefix.TrimEnd('/') + "/$2\"";
    var result = reg.Replace(input, replacement);

    // Emit the rewritten markup so the example has an observable result.
    System.Console.WriteLine(result);
}

the result is

<html>
<head>
<link rel="stylesheet" type="text/css" href="http://mysite.com/css/all.css" /> 
</head>
<body>
<a href="http://mysite.com/test.aspx">Test</a>
<a href="http://mysite.com">Test</a>
<img src="http://mysite.com/images/test.jpg"/>
<img src="http://mysite.com/images/test.jpg"/>
</body>
</html>
Full-Stack Software Engineer

Check this out, it could help you.

It is in the following format: http(s)://domain(:port)/AppPath

// Builds the application's base URL by concatenating the current request's scheme, "://", its authority and the application path.
HttpContext.Current.Request.Url.Scheme + "://" + HttpContext.Current.Request.Url.Authority + HttpContext.Current.Request.ApplicationPath;

Or you could use:

// Passes the relative path "img/youFile" to Page.ResolveUrl. NOTE(review): presumably returns a URL usable by the client; confirm against the ASP.NET documentation.
Page.ResolveUrl("img/youFile");
Mahmoud

Look at this function:

' Rewrites every double-quoted href="..." and src="..." attribute value in html
' to an absolute URL resolved against PageURL.
'
' html    - markup to process; Nothing or "" is returned unchanged.
' PageURL - absolute URL of the page the markup came from; used as the base for
'           resolution. Only parsed when the markup contains at least one match.
' Returns the rewritten markup. The attribute name is matched case-insensitively
' and emitted in lower case. Values that cannot be resolved against the base are
' left exactly as they were.
' Throws UriFormatException / ArgumentNullException if PageURL is not a valid
' absolute URL and the markup contains something to rewrite.
Private Function ConvertALLrelativeLinksToAbsoluteUri(ByVal html As String, ByVal PageURL As String) As String
    If String.IsNullOrEmpty(html) Then
        Return html
    End If

    ' Group 1: attribute name (any casing). Group 2: the quoted value.
    Dim attributePattern As New Regex("(href|src)=""(.*?)""", RegexOptions.IgnoreCase)
    Dim rewritten As New System.Text.StringBuilder(html.Length)
    Dim baseUri As Uri = Nothing
    Dim cursor As Integer = 0

    ' Single pass over the matches: each occurrence is rewritten exactly once,
    ' by position, instead of re-running the regex and a global string replace
    ' on every iteration.
    For Each found As Match In attributePattern.Matches(html)
        If baseUri Is Nothing Then
            baseUri = New Uri(PageURL)
        End If

        ' Copy the untouched text between the previous match and this one.
        rewritten.Append(html, cursor, found.Index - cursor)

        Dim absolute As Uri = Nothing
        If Uri.TryCreate(baseUri, found.Groups(2).Value, absolute) Then
            rewritten.Append(found.Groups(1).Value.ToLowerInvariant())
            rewritten.Append("=""").Append(absolute.AbsoluteUri).Append("""")
        Else
            ' Malformed value: keep the original attribute text.
            rewritten.Append(found.Value)
        End If

        cursor = found.Index + found.Length
    Next

    ' Tail of the document after the last match.
    rewritten.Append(html, cursor, html.Length - cursor)
    Return rewritten.ToString()
End Function

This works great for me. I use it on email templates. I'm using the MVC/Razor "~/" at the beginning of each link.

' Parse HTML and make root-relative asset links absolute with p_basePath.
'
' MailBodyHTML - markup to process; Nothing or "" is returned unchanged.
' Returns the markup with each root-relative reference ("/path/file.ext") to a
' .gif/.jpg/.jpeg/.png/.css/.js file rewritten to p_basePath & "path/file.ext".
' The leading slash is dropped, so p_basePath is expected to end with "/".
' References containing "://", "mailto:" or "javascript:", protocol-relative
' references ("//host/...") and paths without a leading slash are left as-is.
' A WebException raised while processing is recorded in ErrorCodes and the
' markup processed so far is returned.
Public Function ParseHTMLLinks(ByVal MailBodyHTML As String) As String
    If String.IsNullOrEmpty(MailBodyHTML) Then
        Return MailBodyHTML
    End If

    ' Opening "=" or quote, optional leading slash, then (group 1) a path that
    ' ends in one of the known asset extensions, closed by a quote or whitespace.
    Dim srcPattern As String = "[=""]\/?([^""\s]*(\.gif|\.jpg|\.jpeg|\.png|\.css|\.js))[""\s]"
    Dim assetRegex As New Regex(srcPattern, RegexOptions.IgnoreCase)
    Dim rebuilt As New System.Text.StringBuilder(MailBodyHTML.Length)
    Dim cursor As Integer = 0

    Try
        ' Rewrite by position so that every occurrence is handled exactly once.
        ' A global Replace per match would prefix the base path again whenever
        ' the same URL appears twice or is a suffix of another URL.
        For Each found As Match In assetRegex.Matches(MailBodyHTML)
            Dim matchText As String = found.Value

            ' Filter out absolute links.
            If matchText.IndexOf("://", StringComparison.Ordinal) >= 0 _
               OrElse matchText.IndexOf("mailto:", StringComparison.OrdinalIgnoreCase) >= 0 _
               OrElse matchText.IndexOf("javascript:", StringComparison.OrdinalIgnoreCase) >= 0 Then
                Continue For
            End If

            Dim path As Group = found.Groups(1)
            Dim slashIndex As Integer = path.Index - 1

            ' Only root-relative paths are rewritten; without a consumed leading
            ' slash the character before group 1 is the opening "=" or quote.
            If MailBodyHTML(slashIndex) <> "/"c Then
                Continue For
            End If

            ' "//host/file.js" is protocol-relative, not site-relative.
            If path.Value.StartsWith("/", StringComparison.Ordinal) Then
                Continue For
            End If

            ' Build the absolute URL before touching the buffer so a failure
            ' while reading p_basePath cannot leave it half-updated.
            Dim absoluteUrl As String = p_basePath & path.Value

            rebuilt.Append(MailBodyHTML, cursor, slashIndex - cursor)
            rebuilt.Append(absoluteUrl)
            cursor = path.Index + path.Length
        Next

    Catch e As WebException
        'Add errors to List(Of WebException), if any.
        ErrorCodes.Add(e)
    End Try

    ' Remainder of the document after the last rewritten reference.
    rebuilt.Append(MailBodyHTML, cursor, MailBodyHTML.Length - cursor)
    Return rebuilt.ToString()
End Function
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!