How do I get specific path sections from a url? For example, I want a function which operates on this:
http://www.mydomain.com/hithere?image=2934
import urlparse
output = urlparse.urlparse('http://www.example.com/temp/something/happen/index.html').path
output
'/temp/something/happen/index.html'
Then split the path using the string's built-in rpartition method:
output.rpartition('/')[0]
'/temp/something/happen'
The best option is to use the posixpath module when working with the path component of URLs. This module has the same interface as os.path and consistently operates on POSIX paths when used on POSIX and Windows NT based platforms.
Sample Code:
#!/usr/bin/env python3
import urllib.parse
import sys
import posixpath
import ntpath
import json
def path_parse( path_string, *, normalize = True, module = posixpath ):
    """Split a path into a list of its components, outermost first.

    Args:
        path_string: The path to split, e.g. the unquoted path component
            of a URL.
        normalize: When true, pass the path through ``module.normpath``
            before splitting.
        module: Path module supplying ``normpath`` and ``split``
            (``posixpath`` by default; ``ntpath`` offers the same interface).

    Returns:
        A list of path components. The root itself is not included, and an
        empty ``path_string`` yields an empty list.
    """
    if not path_string:
        # A URL without a path has nothing to split; normpath("") would
        # otherwise turn it into ".".
        return []

    tmp = module.normpath( path_string ) if normalize else path_string

    result = []
    while True:
        ( head, item ) = module.split( tmp )
        if head == tmp:
            # split() made no progress: we reached a root ("/", "//", "\\",
            # a drive) or the empty string.  Comparing against a literal "/"
            # here would loop forever on relative paths and non-"/" roots.
            break
        result.append( item )
        tmp = head

    # Components were gathered innermost first.
    result.reverse()
    return result
def dump_array( array ):
    """Format the items of ``array`` as a bracketed list of double-quoted strings.

    Each item is rendered with ``str.format`` and wrapped in double quotes;
    items are separated by ", " and the whole list is wrapped in "[ " / " ]".
    """
    quoted_items = [ '"{}"'.format( element ) for element in array ]
    return "[ " + ", ".join( quoted_items ) + " ]"
def test_url( url, *, normalize = True, module = posixpath ):
    """Parse ``url``, split its unquoted path, and write a report to stdout.

    Args:
        url: The URL whose path component is split.
        normalize: Forwarded to ``path_parse``.
        module: Path module forwarded to ``path_parse``; its ``__name__``
            appears in the report.
    """
    raw_path = urllib.parse.urlparse( url ).path
    # Percent-decode before splitting so escaped characters show up literally.
    decoded_path = urllib.parse.unquote( raw_path )
    components = path_parse( decoded_path, normalize=normalize, module=module )

    report = "{}\n --[n={},m={}]-->\n {}\n".format(
        url, normalize, module.__name__, dump_array( components ) )
    sys.stdout.write( report )
# Each case pairs a URL with the keyword overrides handed to test_url; an
# empty dict means test_url's own defaults are used.
for case_url, case_options in (
    ( "http://eg.com/hithere/something/else", {} ),
    ( "http://eg.com/hithere/something/else/", {} ),
    ( "http://eg.com/hithere/something/else/", { "normalize": False } ),
    ( "http://eg.com/hithere/../else", {} ),
    ( "http://eg.com/hithere/../else", { "normalize": False } ),
    ( "http://eg.com/hithere/../../else", {} ),
    ( "http://eg.com/hithere/../../else", { "normalize": False } ),
    ( "http://eg.com/hithere/something/./else", {} ),
    ( "http://eg.com/hithere/something/./else", { "normalize": False } ),
    ( "http://eg.com/hithere/something/./else/./", {} ),
    ( "http://eg.com/hithere/something/./else/./", { "normalize": False } ),
    # The same backslash-bearing URL is split with both path modules.
    ( "http://eg.com/see%5C/if%5C/this%5C/works", { "normalize": False } ),
    ( "http://eg.com/see%5C/if%5C/this%5C/works",
      { "normalize": False, "module": ntpath } ),
):
    test_url( case_url, **case_options )
Code output:
http://eg.com/hithere/something/else
--[n=True,m=posixpath]-->
[ "hithere", "something", "else" ]
http://eg.com/hithere/something/else/
--[n=True,m=posixpath]-->
[ "hithere", "something", "else" ]
http://eg.com/hithere/something/else/
--[n=False,m=posixpath]-->
[ "hithere", "something", "else", "" ]
http://eg.com/hithere/../else
--[n=True,m=posixpath]-->
[ "else" ]
http://eg.com/hithere/../else
--[n=False,m=posixpath]-->
[ "hithere", "..", "else" ]
http://eg.com/hithere/../../else
--[n=True,m=posixpath]-->
[ "else" ]
http://eg.com/hithere/../../else
--[n=False,m=posixpath]-->
[ "hithere", "..", "..", "else" ]
http://eg.com/hithere/something/./else
--[n=True,m=posixpath]-->
[ "hithere", "something", "else" ]
http://eg.com/hithere/something/./else
--[n=False,m=posixpath]-->
[ "hithere", "something", ".", "else" ]
http://eg.com/hithere/something/./else/./
--[n=True,m=posixpath]-->
[ "hithere", "something", "else" ]
http://eg.com/hithere/something/./else/./
--[n=False,m=posixpath]-->
[ "hithere", "something", ".", "else", ".", "" ]
http://eg.com/see%5C/if%5C/this%5C/works
--[n=False,m=posixpath]-->
[ "see\", "if\", "this\", "works" ]
http://eg.com/see%5C/if%5C/this%5C/works
--[n=False,m=ntpath]-->
[ "see", "if", "this", "works" ]
Notes:
- ntpath does not handle backslashes (\) correctly (see last two cases in code/output) - which is why posixpath is recommended.
- The semantics of multiple path separators (/) are not defined by RFC 3986. However, posixpath collapses multiple adjacent path separators (i.e. it treats ///, // and / the same).
Normative References:
Python 3.4+ solution:
from urllib.parse import unquote, urlparse
from pathlib import PurePosixPath
# Example URL whose first path segment is wanted.
url = 'http://www.example.com/hithere/something/else'

# Parse the URL, percent-decode its path, then index into the POSIX path parts.
PurePosixPath(unquote(urlparse(url).path)).parts[1]
# returns 'hithere' (the same for the URL with parameters)

# parts holds ('/', 'hithere', 'something', 'else')
#               0    1          2            3
Note: in Python 3 the import has changed to `from urllib.parse import urlparse`.
See documentation. Here is an example:
>>> from urllib.parse import urlparse
>>> url = 's3://bucket.test/my/file/directory'
>>> p = urlparse(url)
>>> p
ParseResult(scheme='s3', netloc='bucket.test', path='/my/file/directory', params='', query='', fragment='')
>>> p.scheme
's3'
>>> p.netloc
'bucket.test'
>>> p.path
'/my/file/directory'
A combination of urlparse and os.path.split will do the trick. The following script stores all sections of a url in a list, backwards.
import os.path, urlparse
def generate_sections_of_url(url):
    """Return the path sections of ``url`` as a list, last section first.

    Args:
        url: The URL whose path component is split.

    Returns:
        A list of path sections in reverse order (innermost first). A URL
        whose path is empty or just "/" yields an empty list.
    """
    # The parser moved between Python 2 and 3; importing locally keeps this
    # function usable under either interpreter.
    try:
        from urllib.parse import urlparse
    except ImportError:
        from urlparse import urlparse

    path = urlparse(url).path
    sections = []
    while True:
        head, tail = os.path.split(path)
        if head == path:
            # split() made no progress: the root or the empty string was
            # reached.  Testing only for '/' would loop forever on a URL
            # whose path is empty or relative.
            break
        sections.append(tail)
        path = head
    return sections
This would return: ["else", "something", "hithere"]
Here is an example using urlparse and rpartition.
# Python 2x:
from urlparse import urlparse
# Python 3x:
from urllib.parse import urlparse
def printPathTokens(full_url):
    """Print the path of ``full_url`` split at its last slash.

    Prints the URL, its path component, and then either a note that the path
    contains no slash, or the part before the last slash followed by the part
    after it ("(none)" when the path ends with a slash).
    """
    print('printPathTokens() called: %s' % full_url)

    p_full = urlparse(full_url).path
    print(' . p_full url: %s' % p_full)

    # rpartition yields (text before the last '/', the '/' itself, text after
    # it); the middle element is empty when no '/' occurs in the path.
    parent, slash, leaf = p_full.rpartition('/')
    if not slash:
        print(' . No slashes found in path')
        return

    print(' . path to last resource: %s' % parent)
    if leaf:
        print(' . last resource: %s' % (leaf))
    else:
        # Nothing follows the final slash.
        print(' . last resource: (none)')
# Case 1: the path ends in a file name.
printPathTokens('http://www.example.com/temp/something/happen/index.html')
# Output:
# printPathTokens() called: http://www.example.com/temp/something/happen/index.html
# . p_full url: /temp/something/happen/index.html
# . path to last resource: /temp/something/happen
# . last resource: index.html
# Case 2: the path ends with a trailing slash, so there is no last resource.
printPathTokens('http://www.example.com/temp/something/happen/')
# Output:
# printPathTokens() called: http://www.example.com/temp/something/happen/
# . p_full url: /temp/something/happen/
# . path to last resource: /temp/something/happen
# . last resource: (none)
# Case 3: no trailing slash; the final segment is reported as the last resource.
printPathTokens('http://www.example.com/temp/something/happen')
# Output:
# printPathTokens() called: http://www.example.com/temp/something/happen
# . p_full url: /temp/something/happen
# . path to last resource: /temp/something
# . last resource: happen