Best way to convert a Unicode URL to ASCII (UTF-8 percent-escaped) in Python?

前端 未结 5 1512
情话喂你
情话喂你 2020-12-12 13:30

I'm wondering what's the best way -- or if there's a simple way with the standard library -- to convert a URL with Unicode chars in the domain name and path to the equivalent ASCII URL (IDNA-encoded domain, UTF-8 percent-escaped path).

5条回答
  •  無奈伤痛
    2020-12-12 14:07

    Okay, with these comments and some bug-fixing in my own code (it didn't handle fragments at all), I've come up with the following canonurl() function -- returns a canonical, ASCII form of the URL:

    import re
    import urllib
    import urlparse
    
    def canonurl(url):
        r"""Return the canonical, ASCII-only form of a URL, or '' if it looks invalid.

        ``url`` may be a UTF-8 encoded byte string or a text string.  The host
        is IDNA-encoded; path, query and fragment are percent-decoded and then
        re-encoded as UTF-8 percent-escapes (so ``%2F`` becomes a literal
        ``/``).  A missing scheme defaults to ``http`` and an empty path to
        ``/``.  The result is a native ``str`` of at most 4096 characters and
        never ends in a partial ``%XX`` escape.

        Invalid input (undecodable UTF-8, bad scheme, host without a short
        alphanumeric top-level label, non-numeric port, host that cannot be
        IDNA-encoded, malformed bracketed host) yields '' instead of raising.

        >>> canonurl('    ')
        ''
        >>> canonurl('www.google.com')
        'http://www.google.com/'
        >>> canonurl(b'bad-utf8.com/path\xff/file')
        ''
        >>> canonurl('svn://blah.com/path/file')
        'svn://blah.com/path/file'
        >>> canonurl('1234://badscheme.com')
        ''
        >>> canonurl('bad$scheme://google.com')
        ''
        >>> canonurl('site.badtopleveldomain')
        ''
        >>> canonurl('site.com:badport')
        ''
        >>> canonurl('http://a..com/')
        ''
        >>> canonurl('http://[bad/')
        ''
        >>> canonurl('http://123.24.8.240/blah')
        'http://123.24.8.240/blah'
        >>> canonurl('http://123.24.8.240:1234/blah?q#f')
        'http://123.24.8.240:1234/blah?q#f'
        >>> canonurl(b'\xe2\x9e\xa1.ws')  # tinyarro.ws
        'http://xn--hgi.ws/'
        >>> canonurl('  http://www.google.com:80/path/file;params?query#fragment  ')
        'http://www.google.com:80/path/file;params?query#fragment'
        >>> canonurl(b'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5')
        'http://xn--hgi.ws/%E2%99%A5'
        >>> canonurl(b'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5/pa%2Fth')
        'http://xn--hgi.ws/%E2%99%A5/pa/th'
        >>> canonurl(b'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5/pa%2Fth;par%2Fams?que%2Fry=a&b=c')
        'http://xn--hgi.ws/%E2%99%A5/pa/th;par/ams?que/ry=a&b=c'
        >>> canonurl(b'http://\xe2\x9e\xa1.ws/\xe2\x99\xa5?\xe2\x99\xa5#\xe2\x99\xa5')
        'http://xn--hgi.ws/%E2%99%A5?%E2%99%A5#%E2%99%A5'
        >>> canonurl(b'http://\xe2\x9e\xa1.ws/%e2%99%a5?%E2%99%A5#%E2%99%A5')
        'http://xn--hgi.ws/%E2%99%A5?%E2%99%A5#%E2%99%A5'
        >>> canonurl('http://badutf8pcokay.com/%FF?%FE#%FF')
        'http://badutf8pcokay.com/%FF?%FE#%FF'
        >>> len(canonurl('google.com/' + 'a' * 16384))
        4096
        >>> len(canonurl('http://ab.com/' + '%FF' * 2000))  # no dangling '%F'
        4094
        """
        # Imported locally so the function runs unchanged on Python 2 and 3.
        try:  # Python 3
            from urllib.parse import quote, unquote_to_bytes, urlsplit, urlunsplit
        except ImportError:  # Python 2, where unquote() already works on bytes
            from urllib import quote, unquote as unquote_to_bytes
            from urlparse import urlsplit, urlunsplit

        max_length = 4096

        def requote(component, safe):
            # Decode existing %-escapes first so nothing gets double-encoded,
            # then escape every byte outside the safe set.
            raw = unquote_to_bytes(component.encode('utf-8'))
            return quote(raw, safe=safe)

        # strip spaces at the ends
        url = url.strip()
        if not url:
            return ''

        # turn byte strings into text
        if isinstance(url, bytes):
            try:
                url = url.decode('utf-8')
            except UnicodeDecodeError:
                return ''  # bad UTF-8 chars in URL

        # ensure it's prefixed with 'scheme://' and parse it into components;
        # urlsplit() raises ValueError for e.g. unbalanced '[' in the host
        try:
            if not urlsplit(url).scheme:
                url = 'http://' + url
            scheme, netloc, path, query, fragment = urlsplit(url)
        except ValueError:
            return ''

        # ensure scheme is a letter followed by letters, digits, and '+-.' chars
        if not re.match(r'[A-Za-z][-+.A-Za-z0-9]*$', scheme):
            return ''
        scheme = str(scheme)

        # ensure domain and port are valid, eg: sub.domain.<1-to-6-TLD-chars>[:port]
        # (explicit ASCII classes: on text patterns re.I and \d would also
        # accept non-ASCII letters and digits under Python 3)
        match = re.match(r'(.+\.[A-Za-z0-9]{1,6})(:[0-9]{1,5})?$', netloc)
        if not match:
            return ''
        domain, port = match.groups()

        try:
            # the idna codec rejects empty labels and labels over 63 chars
            netloc = str((domain + (port or '')).encode('idna').decode('ascii'))
            # '' path becomes '/', eg: 'http://google.com' -> 'http://google.com/'
            path = requote(path or '/', '/;')
            query = requote(query, '=&?/')
            fragment = requote(fragment, '/')
        except UnicodeError:
            return ''

        # piece it all back together, truncating it to a maximum of 4KB
        url = urlunsplit((scheme, netloc, path, query, fragment))
        if len(url) > max_length:
            url = url[:max_length]
            # a '%' in the last two positions is an escape cut in half: drop it
            partial = url.find('%', max_length - 2)
            if partial != -1:
                url = url[:partial]
        return url
    
    if __name__ == '__main__':
        # Executed as a script: run the doctests embedded in this module.
        from doctest import testmod
        testmod()
    

提交回复
热议问题