How do I filter all HTML tags except a certain whitelist?

后端 未结 8 1552
一整个雨季
一整个雨季 2020-11-27 12:07

This is for .NET. IgnoreCase is set and MultiLine is NOT set.

Usually I'm decent at regex; maybe I'm running low on caffeine...

Users are allowed to enter

8条回答
  •  时光取名叫无心
    2020-11-27 12:43

        /// <summary>
        /// Strips HTML tags from the given text, leaving the explicitly whitelisted tags in place.
        /// </summary>
        /// <remarks>
        /// Whitelisted tags are kept verbatim, including any attributes they carry; this method
        /// does not sanitize attributes. Tag names are matched case-insensitively and as whole
        /// names, so whitelisting "b" keeps the b element but not br or body.
        /// </remarks>
        /// <param name="text">The text from which HTML is to be removed. Null or empty input is returned unchanged.</param>
        /// <param name="isRemoveScript">True to remove script elements together with their content; false to strip only the script tags themselves (unless whitelisted).</param>
        /// <param name="ignorableTags">Names of the tags to leave untouched (without angle brackets). May be null or empty.</param>
        /// <returns>The text with every non-whitelisted tag removed.</returns>
        public static string StripHtml(string text, bool isRemoveScript, params string[] ignorableTags)
        {
            if (string.IsNullOrEmpty(text))
            {
                return text;
            }

            // A caller can pass an explicit null for the params array; treat it as "no whitelist".
            // Blank entries are dropped because they would produce an empty regex alternative.
            string[] allowedTags = (ignorableTags ?? new string[0])
                .Where(tag => !string.IsNullOrWhiteSpace(tag))
                .Select(tag => tag.Trim())
                .ToArray();

            // Decode entity-encoded angle brackets first so that encoded tags are stripped as well.
            text = text.Replace("&lt;", "<");
            text = text.Replace("&gt;", ">");

            // Singleline lets '.' span line breaks, so multi-line script/style bodies are removed whole.
            RegexOptions blockOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

            if (isRemoveScript)
            {
                text = Regex.Replace(text, @"<script\b[^>]*>.*?</script\s*>", string.Empty, blockOptions);
            }

            // Style blocks are removed with their content unless the caller whitelisted "style".
            if (!allowedTags.Contains("style", StringComparer.OrdinalIgnoreCase))
            {
                text = Regex.Replace(text, @"<style\b[^>]*>.*?</style\s*>", string.Empty, blockOptions);
            }

            // Negative lookahead that rejects an (optionally closing) whitelisted tag name. The
            // trailing (?=[\s/>]) forces a whole-name match, so "b" no longer shields "br"/"body"
            // and "i" no longer shields "iframe"/"img". Names are escaped so they cannot inject regex syntax.
            string keepPattern = string.Empty;
            if (allowedTags.Length > 0)
            {
                string alternatives = string.Join("|", allowedTags.Select(tag => Regex.Escape(tag)));
                keepPattern = string.Format(@"(?!/?(?:{0})(?=[\s/>]))", alternatives);
            }

            // [^<>]* stops at the first '>' so a literal '>' in the following text is not swallowed.
            string tagPattern = string.Format(@"<{0}[^<>]*>", keepPattern);
            return Regex.Replace(text, tagPattern, string.Empty, RegexOptions.IgnoreCase);
        }
    

提交回复
热议问题