Hi, I am trying to extract the root domain from a URL string in Google Sheets. I know how to get the domain, and I have a formula to remove the "www." prefix,
but now I realize...
I think the most reliable way is to check against a TLD list, because suffixes like co.uk, gov.uk, and so on are impossible to extract with a simple regex.
You can define these functions in Tools -> Script editor
/**
 * Reports whether `str` ends with `searchString`.
 *
 * @param {string} str - The string to inspect.
 * @param {string} searchString - The suffix to look for.
 * @returns {boolean} True when the last occurrence of `searchString` sits
 *     exactly at the end of `str`.
 */
function endsWith(str, searchString) {
  // Declared locally: the bare assignment used before leaked a global
  // `position` and throws in strict-mode code.
  var position = str.length - searchString.length;
  var lastIndex = str.lastIndexOf(searchString);
  // The explicit -1 check matters when searchString is exactly one character
  // longer than str: position is then -1 as well, and must not count as a hit.
  return lastIndex !== -1 && lastIndex === position;
}
/**
 * Splits raw suffix-list text into its usable entries.
 *
 * The text is cut on "\n" and only lines whose first character matches \w
 * (ASCII letter, digit or underscore) are kept, so blank lines and lines
 * starting with "/", "*", "!" or whitespace are dropped.
 *
 * @param {string} raw - Newline-separated list text.
 * @returns {string[]} The kept lines, in their original order.
 */
function rawToTlds(raw) {
  var lines = raw.split(/\n/);
  var entries = [];
  for (var idx = 0; idx < lines.length; idx++) {
    var line = lines[idx];
    if (/^\w/.test(line)) {
      entries.push(line);
    }
  }
  return entries;
}
/**
 * Returns the list of public suffixes, using the script cache when possible.
 *
 * On a cache miss the Public Suffix List is downloaded, reduced to its
 * entries with rawToTlds(), and stored in the script cache as a
 * comma-joined string.
 *
 * @returns {string[]} Suffixes without a leading dot (e.g. "co.uk").
 */
function getTlds() {
  var cacheName = 'TLDs';
  // CacheService documents 21600 seconds (6 hours) as the longest allowed
  // expiration; the previous value of 60000 was above that limit.
  var cacheSeconds = 21600;
  var cache = CacheService.getScriptCache();
  var list = cache.get(cacheName);
  if (list != null) {
    return list.split(',');
  }
  var raw = UrlFetchApp.fetch('https://publicsuffix.org/list/public_suffix_list.dat').getContentText();
  var tlds = rawToTlds(raw);
  try {
    cache.put(cacheName, tlds.join(), cacheSeconds);
  } catch (err) {
    // Caching is best-effort. CacheService also limits the size of a single
    // value, and the joined list may exceed it; a failed put must not discard
    // the list that was just fetched.
    console.warn('Could not cache the TLD list: ' + err);
  }
  return tlds;
}
/**
 * Extracts the host name from a URL, optionally trimmed to a given number of
 * labels counted from the right, where the public suffix counts as one label.
 *
 * The scheme (http/https), a leading "www." and everything from the first
 * "/", "?" or "#" onward are removed first. Without `level` that host is
 * returned as is. With `level`, the longest suffix from getTlds() that the
 * host ends with is located and the `level - 1` labels before it are kept,
 * so level 2 of "blog.example.co.uk" is "example.co.uk". A level of 1 yields
 * the suffix with its leading dot. If no suffix matches, the host is returned
 * unchanged.
 *
 * @param {string} url - URL or host name.
 * @param {number} [level] - Number of labels to keep, suffix included.
 * @returns {string} The full host, or the host cut down to `level` labels.
 */
function getDomainName(url, level) {
  var domain = url
    .replace(/^http(s)?:\/\//i, "")
    .replace(/^www\./i, "")
    // Also cut at "?" and "#" so "example.com?x=1" does not keep its query.
    .replace(/[\/?#].*$/, "");
  if (typeof level === 'undefined') {
    return domain;
  }
  // Loaded only when needed: the suffix list is irrelevant for the plain
  // host and fetching it may cost a network round trip.
  var tlds = getTlds();
  var result = domain;
  var longest = 0;
  for (var i = 0; i < tlds.length; i++) {
    var tld = '.' + tlds[i];
    if (tld.length > longest && endsWith(domain, tld)) {
      var parts = domain.substring(0, domain.length - tld.length).split('.');
      // Clamp at 0: a negative start would make slice() count from the end
      // and drop labels when `level` exceeds what the host has.
      var start = Math.max(0, parts.length - level + 1);
      result = parts.slice(start).join('.') + tld;
      longest = tld.length;
    }
  }
  return result;
}
To get the second-level domain of A1, use it like this:
=getDomainName(A1, 2)
To get the full domain of A1, just do:
=getDomainName(A1)
Currently using:
=trim(REGEXEXTRACT(REGEXREPLACE(REGEXREPLACE(A2;"https?://";"");"^(w{3}\.)?";"")&"/";"([^/?]+)"))
Seems to work fine
Updated:7-7-2016
(thanks for all the help!)