Extracting rootdomains from URL string in Google Sheets

后端 未结 2 1236
时光取名叫无心
时光取名叫无心 2020-12-31 09:19

Hi I am trying to extract the rootdomain from URL string in Google Sheets. I know how to get the domain and I have the formula to remove www. but now I realize

相关标签:
2条回答
  • 2020-12-31 09:28

    I think the most reliable way is to check against a list of public suffixes, because suffixes like co.uk, gov.uk and so on are impossible to extract with a simple regex.

    You can define these functions in Tools -> Script editor

    /**
     * Reports whether `str` ends with `searchString`.
     *
     * @param {string} str - The string to inspect.
     * @param {string} searchString - The suffix to look for.
     * @returns {boolean} True when `searchString` is the tail of `str`.
     */
    function endsWith(str, searchString) {
        // Kept local so the function never creates a global variable.
        var position = str.length - searchString.length;
        // A suffix longer than the string itself can never match.
        if (position < 0) {
            return false;
        }
        // The last occurrence must start exactly where the suffix would begin.
        return str.lastIndexOf(searchString) === position;
    }
    
    /**
     * Extracts suffix entries from the raw text of a suffix list.
     *
     * Only lines that begin with an ASCII word character (letter, digit or
     * underscore) are kept, so blank lines, `//` comment lines, and lines
     * starting with any other character (for example `*` or `!`) are skipped.
     * Both LF and CRLF line endings are accepted, and trailing whitespace is
     * removed from every entry so it can be compared against a host name.
     *
     * @param {string} raw - Full text of the list, one entry per line.
     * @returns {string[]} The kept entries, in their original order.
     */
    function rawToTlds(raw) {
        var startsWithWordChar = /^\w/;
        return raw
            .split(/\r?\n/)
            .map(function (line) {
                // A stray "\r" or trailing space would make the entry unmatchable.
                return line.replace(/\s+$/, '');
            })
            .filter(function (line) {
                return startsWithWordChar.test(line);
            });
    }
    
    /**
     * Returns the list of suffixes, using the script cache when possible.
     *
     * On a cache hit the cached comma-separated string is split and returned.
     * On a miss the list is downloaded from publicsuffix.org, parsed with
     * `rawToTlds`, stored in the script cache and returned.
     *
     * @returns {string[]} Suffix entries such as "com" or "co.uk".
     */
    function getTlds() {
        var cacheName = 'TLDs';
        // CacheService documents 21600 seconds (6 hours) as the longest
        // expiration a cached value may be given.
        var cacheTtlSeconds = 21600;

        var cache = CacheService.getScriptCache();
        var cached = cache.get(cacheName);
        if (cached != null) {
            return cached.split(',');
        }

        var raw = UrlFetchApp.fetch('https://publicsuffix.org/list/public_suffix_list.dat').getContentText();
        var tlds = rawToTlds(raw);
        // NOTE(review): CacheService also limits the size of a single value;
        // confirm the joined list stays under that limit.
        cache.put(cacheName, tlds.join(','), cacheTtlSeconds);
        return tlds;
    }
    
    /**
     * Extracts a domain name from a URL string.
     *
     * The scheme (`http://` / `https://`), a leading `www.`, and everything
     * from the first `/`, `?` or `#` onward are removed first.
     *
     * @param {string} url - The URL or host to process.
     * @param {number} [level] - How many levels to keep, counting the matched
     *     suffix as level 1. For example, with suffix "co.uk", level 2 of
     *     "a.b.example.co.uk" is "example.co.uk". When omitted, the full host
     *     is returned and the suffix list is not loaded.
     * @returns {string} The requested domain. If no suffix from the list
     *     matches, the full host is returned unchanged.
     */
    function getDomainName(url, level) {
        var domain = url
            .replace(/^http(s)?:\/\//i, "")
            .replace(/^www\./i, "")
            .replace(/[\/?#].*$/, "");

        if (typeof level === 'undefined') {
            return domain;
        }

        // Loaded only when needed: this may trigger a network fetch.
        var tlds = getTlds();
        // Number of labels to keep in front of the matched suffix.
        var keep = Math.max(0, level - 1);

        var result = domain;
        var longest = 0;
        for (var i = 0; i < tlds.length; i++) {
            var tld = '.' + tlds[i];
            // Prefer the longest matching suffix, e.g. ".co.uk" over ".uk".
            if (tld.length > longest && endsWith(domain, tld)) {
                var parts = domain.substring(0, domain.length - tld.length).split('.');
                // Clamp at 0 so asking for more levels than exist returns the
                // whole host instead of wrapping around via a negative index.
                var start = Math.max(0, parts.length - keep);
                var kept = parts.slice(start);
                // With nothing kept, return the bare suffix without a leading dot.
                result = kept.length > 0 ? kept.join('.') + tld : tlds[i];
                longest = tld.length;
            }
        }

        return result;
    }
    

    To get the second-level domain of A1, use it like this:

    =getDomainName(A1, 2)
    

    To get the full domain of A1, just do:

    =getDomainName(A1)
    
    0 讨论(0)
  • 2020-12-31 09:43

    Currently using:

    =trim(REGEXEXTRACT(REGEXREPLACE(REGEXREPLACE(A2;"https?://";"");"^(w{3}\.)?";"")&"/";"([^/?]+)"))

    Seems to work fine

    Updated:7-7-2016

    (thanks for all the help!)

    0 讨论(0)
提交回复
热议问题