Is there a method around that tests whether two URLs are equal, i.e. point to the same place? I am not talking about two URLs with different domain names pointing to the same IP address.
For the record, here is a translation of http://en.wikipedia.org/wiki/URL%5Fnormalization to C#:
using System;
using System.Web;

namespace UrlNormalizationTest
{
    public static class UrlNormalization
    {
        public static bool AreTheSameUrls(this string url1, string url2)
        {
            url1 = url1.NormalizeUrl();
            url2 = url2.NormalizeUrl();
            return url1.Equals(url2);
        }

        public static bool AreTheSameUrls(this Uri uri1, Uri uri2)
        {
            var url1 = uri1.NormalizeUrl();
            var url2 = uri2.NormalizeUrl();
            return url1.Equals(url2);
        }

        public static string[] DefaultDirectoryIndexes = new[]
        {
            "default.asp",
            "default.aspx",
            "index.htm",
            "index.html",
            "index.php"
        };

        public static string NormalizeUrl(this Uri uri)
        {
            var url = urlToLower(uri);
            url = limitProtocols(url);
            url = removeDefaultDirectoryIndexes(url);
            url = removeTheFragment(url);
            url = removeDuplicateSlashes(url);
            url = addWww(url);
            url = removeFeedburnerPart(url);
            return removeTrailingSlashAndEmptyQuery(url);
        }

        public static string NormalizeUrl(this string url)
        {
            return NormalizeUrl(new Uri(url));
        }

        private static string removeFeedburnerPart(string url)
        {
            // Cut everything from "utm_source=" onwards, including the
            // preceding '?' or '&' separator.
            var idx = url.IndexOf("utm_source=", StringComparison.Ordinal);
            return idx == -1 ? url : url.Substring(0, idx - 1);
        }

        private static string addWww(string url)
        {
            // Only prepend "www." to bare second-level hosts (e.g. example.com).
            if (new Uri(url).Host.Split('.').Length == 2 && !url.Contains("://www."))
            {
                return url.Replace("://", "://www.");
            }
            return url;
        }

        private static string removeDuplicateSlashes(string url)
        {
            var path = new Uri(url).AbsolutePath;
            return path.Contains("//") ? url.Replace(path, path.Replace("//", "/")) : url;
        }

        private static string limitProtocols(string url)
        {
            return new Uri(url).Scheme == "https" ? url.Replace("https://", "http://") : url;
        }

        private static string removeTheFragment(string url)
        {
            var fragment = new Uri(url).Fragment;
            return string.IsNullOrWhiteSpace(fragment) ? url : url.Replace(fragment, string.Empty);
        }

        private static string urlToLower(Uri uri)
        {
            // Lowercases the whole URL (not just scheme and host) and decodes
            // percent-encoded octets, which also normalizes escape-sequence casing.
            return HttpUtility.UrlDecode(uri.AbsoluteUri.ToLowerInvariant());
        }

        private static string removeTrailingSlashAndEmptyQuery(string url)
        {
            return url
                .TrimEnd('?')
                .TrimEnd('/');
        }

        private static string removeDefaultDirectoryIndexes(string url)
        {
            foreach (var index in DefaultDirectoryIndexes)
            {
                if (url.EndsWith(index))
                {
                    // Cut the exact file name. Note: TrimEnd(index.ToCharArray())
                    // would be wrong here, because it strips any trailing run of
                    // those characters, not just the index file name itself.
                    url = url.Substring(0, url.Length - index.Length);
                    break;
                }
            }
            return url;
        }
    }
}
With the following tests:
using NUnit.Framework;
using UrlNormalizationTest;

namespace UrlNormalization.Tests
{
    [TestFixture]
    public class UnitTests
    {
        [Test]
        public void Test1ConvertingTheSchemeAndHostToLowercase()
        {
            var url1 = "HTTP://www.Example.com/".NormalizeUrl();
            var url2 = "http://www.example.com/".NormalizeUrl();
            Assert.AreEqual(url1, url2);
        }

        [Test]
        public void Test2CapitalizingLettersInEscapeSequences()
        {
            var url1 = "http://www.example.com/a%c2%b1b".NormalizeUrl();
            var url2 = "http://www.example.com/a%C2%B1b".NormalizeUrl();
            Assert.AreEqual(url1, url2);
        }

        [Test]
        public void Test3DecodingPercentEncodedOctetsOfUnreservedCharacters()
        {
            var url1 = "http://www.example.com/%7Eusername/".NormalizeUrl();
            var url2 = "http://www.example.com/~username/".NormalizeUrl();
            Assert.AreEqual(url1, url2);
        }

        [Test]
        public void Test4RemovingTheDefaultPort()
        {
            var url1 = "http://www.example.com:80/bar.html".NormalizeUrl();
            var url2 = "http://www.example.com/bar.html".NormalizeUrl();
            Assert.AreEqual(url1, url2);
        }

        [Test]
        public void Test5AddingTrailing()
        {
            var url1 = "http://www.example.com/alice".NormalizeUrl();
            var url2 = "http://www.example.com/alice/?".NormalizeUrl();
            Assert.AreEqual(url1, url2);
        }

        [Test]
        public void Test6RemovingDotSegments()
        {
            var url1 = "http://www.example.com/../a/b/../c/./d.html".NormalizeUrl();
            var url2 = "http://www.example.com/a/c/d.html".NormalizeUrl();
            Assert.AreEqual(url1, url2);
        }

        [Test]
        public void Test7RemovingDirectoryIndex1()
        {
            var url1 = "http://www.example.com/default.asp".NormalizeUrl();
            var url2 = "http://www.example.com/".NormalizeUrl();
            Assert.AreEqual(url1, url2);
        }

        [Test]
        public void Test7RemovingDirectoryIndex2()
        {
            // A directory index with a query string must be left untouched,
            // so both sides are deliberately the same URL.
            var url1 = "http://www.example.com/default.asp?id=1".NormalizeUrl();
            var url2 = "http://www.example.com/default.asp?id=1".NormalizeUrl();
            Assert.AreEqual(url1, url2);
        }

        [Test]
        public void Test7RemovingDirectoryIndex3()
        {
            var url1 = "http://www.example.com/a/index.html".NormalizeUrl();
            var url2 = "http://www.example.com/a/".NormalizeUrl();
            Assert.AreEqual(url1, url2);
        }

        [Test]
        public void Test8RemovingTheFragment()
        {
            var url1 = "http://www.example.com/bar.html#section1".NormalizeUrl();
            var url2 = "http://www.example.com/bar.html".NormalizeUrl();
            Assert.AreEqual(url1, url2);
        }

        [Test]
        public void Test9LimitingProtocols()
        {
            var url1 = "https://www.example.com/".NormalizeUrl();
            var url2 = "http://www.example.com/".NormalizeUrl();
            Assert.AreEqual(url1, url2);
        }

        [Test]
        public void Test10RemovingDuplicateSlashes()
        {
            var url1 = "http://www.example.com/foo//bar.html".NormalizeUrl();
            var url2 = "http://www.example.com/foo/bar.html".NormalizeUrl();
            Assert.AreEqual(url1, url2);
        }

        [Test]
        public void Test11AddWww()
        {
            var url1 = "http://example.com/".NormalizeUrl();
            var url2 = "http://www.example.com".NormalizeUrl();
            Assert.AreEqual(url1, url2);
        }

        [Test]
        public void Test12RemoveFeedburnerPart()
        {
            var url1 = "http://site.net/2013/02/firefox-19-released/?utm_source=rss&utm_medium=rss&utm_campaign=firefox-19-released".NormalizeUrl();
            var url2 = "http://site.net/2013/02/firefox-19-released".NormalizeUrl();
            Assert.AreEqual(url1, url2);
        }
    }
}
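For completeness, a quick usage sketch of the extension methods above (the sample URLs are made up):

// Both sides normalize to "http://www.example.com/a/b".
bool same = "http://example.com/a//b/index.html"
    .AreTheSameUrls("HTTPS://www.Example.com/a/b/");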
Maybe this tutorial can be of help to you?
"...You want to see how to handle identical Urls in the sitemap (which is forbidden by the out-of-the-box SiteMapProvider)..."
/// <summary>
/// SiteMap datasources cannot have duplicate URLs with the default provider.
/// This finds duplicate URLs in your hierarchy and tricks the provider into
/// treating them correctly.
/// </summary>
private void modifyDuplicateUrls()
{
    // StringCollection lives in System.Collections.Specialized.
    StringCollection urls = new StringCollection();
    string rowUrl = String.Empty;
    uint duplicateCounter = 0;
    string urlModifier = String.Empty;
    foreach (DataTable dt in this.DataSource.Tables)
    {
        foreach (DataRow dr in dt.Rows)
        {
            rowUrl = (string)dr["Url"];
            if (urls.Contains(rowUrl))
            {
                // Append a unique dummy parameter so the provider sees a distinct URL.
                duplicateCounter++;
                if (rowUrl.Contains("?"))
                {
                    urlModifier = "&instance=" + duplicateCounter.ToString();
                }
                else
                {
                    urlModifier = "?instance=" + duplicateCounter.ToString();
                }
                dr["Url"] = rowUrl + urlModifier;
            }
            else
            {
                urls.Add(rowUrl);
            }
        }
    }
}
You might be looking for URL normalization techniques; they would be a good starting point :)
Once you have normalized the URLs, you simply need to check whether they are equal, keeping your assumptions in mind (for instance, whether you discard the query string).
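A minimal sketch of that idea using only the built-in Uri class; which components you compare (and whether you drop the query string, as done here) are assumptions to adjust:

using System;

static bool RoughlyEqual(string url1, string url2)
{
    // Compare scheme, host and path only; the query string and fragment
    // are deliberately ignored here -- change the components to taste.
    return Uri.Compare(new Uri(url1), new Uri(url2),
        UriComponents.Scheme | UriComponents.Host | UriComponents.Path,
        UriFormat.SafeUnescaped,
        StringComparison.OrdinalIgnoreCase) == 0;
}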
Frankly, just load the two URLs and compare their HTML contents?
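A rough sketch of that brute-force approach, assuming you can live with the network round-trips and with false negatives on dynamic pages:

using System.Net.Http;
using System.Threading.Tasks;

static async Task<bool> SameContentAsync(string url1, string url2)
{
    using (var client = new HttpClient())
    {
        // Fetch both pages and compare the raw HTML verbatim.
        var html1 = await client.GetStringAsync(url1);
        var html2 = await client.GetStringAsync(url2);
        return html1 == html2;
    }
}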
You could probably use the Uri class to check individual parts of the URLs, after converting each to the right format.
// Create the URI objects
// TODO: Use the right constructor overloads,
// or do some processing beforehand to accommodate the different scenarios
Uri uri1 = new Uri(url1);
Uri uri2 = new Uri(url2);

// There are overloads for the constructor too
Uri uri3 = new Uri(url3, UriKind.Absolute);

// Check the correct properties
// TODO: Use the right properties...
if (uri1.AbsolutePath == uri2.AbsolutePath)
{
    // URLs match
}
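Note that Uri already overrides Equals and the == operator, and that comparison ignores the fragment and treats scheme and host case-insensitively (the path stays case-sensitive), so for simple cases it may be all you need:

var a = new Uri("HTTP://www.Example.com/path#section");
var b = new Uri("http://www.example.com/path");
Console.WriteLine(a == b); // True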
What about seeing if Server.MapPath is equal for both URLs? (Assuming this is an ASP.NET application, not ASP.NET MVC; note that Server.MapPath only accepts virtual paths within your own application, not absolute URLs to other sites.)
return Server.MapPath(url1).ToLower() == Server.MapPath(url2).ToLower();