Given the URL (single line):
http://test.example.com/dir/subdir/file.html
How can I extract the following parts using regular expressions:
This improved version should work as reliably as a parser.
// Applies to URI, not just URL or URN:
// http://en.wikipedia.org/wiki/Uniform_Resource_Identifier#Relationship_to_URL_and_URN
//
// http://labs.apache.org/webarch/uri/rfc/rfc3986.html#regexp
//
// (?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?
//
// http://en.wikipedia.org/wiki/URI_scheme#Generic_syntax
//
// $@ matches the entire uri
// $1 matches scheme (ftp, http, mailto, mshelp, ymsgr, etc)
// $2 matches authority (host, user:pwd@host, etc)
// $3 matches path
// $4 matches query (http GET REST api, etc)
// $5 matches fragment (html anchor, etc)
//
// Match specific schemes, non-optional authority, disallow white-space so can delimit in text, and allow 'www.' w/o scheme
// Note the schemes must match ^[^\s|:/?#]+(?:\|[^\s|:/?#]+)*$
//
// (?:()(www\.[^\s/?#]+\.[^\s/?#]+)|(schemes)://([^\s/?#]*))([^\s?#]*)(?:\?([^\s#]*))?(#(\S*))?
//
// Validate the authority with an orthogonal RegExp, so the RegExp above won’t fail to match any valid urls.
function uriRegExp( flags, schemes/* = null*/, noSubMatches/* = false*/ )
{
if( !schemes )
schemes = '[^\\s:\/?#]+'
else if( !RegExp( /^[^\s|:\/?#]+(?:\|[^\s|:\/?#]+)*$/ ).test( schemes ) )
throw TypeError( 'expected URI schemes' )
return noSubMatches ? new RegExp( '(?:www\\.[^\\s/?#]+\\.[^\\s/?#]+|' + schemes + '://[^\\s/?#]*)[^\\s?#]*(?:\\?[^\\s#]*)?(?:#\\S*)?', flags ) :
new RegExp( '(?:()(www\\.[^\\s/?#]+\\.[^\\s/?#]+)|(' + schemes + ')://([^\\s/?#]*))([^\\s?#]*)(?:\\?([^\\s#]*))?(?:#(\\S*))?', flags )
}
// http://en.wikipedia.org/wiki/URI_scheme#Official_IANA-registered_schemes
function uriSchemesRegExp()
{
return 'about|callto|ftp|gtalk|http|https|irc|ircs|javascript|mailto|mshelp|sftp|ssh|steam|tel|view-source|ymsgr'
}