/// <summary>
/// Reduces an HTML fragment to clean body text that contains only
/// <c>p</c>, <c>img</c>, <c>strong</c> tags: every other tag is removed (its inner
/// text is kept), <c>&lt;p&gt;</c> loses its attributes, <c>&lt;img&gt;</c> keeps only
/// its <c>src</c>, and <c>&lt;br&gt;</c> / CR / LF / TAB become paragraph breaks.
/// </summary>
/// <param name="content">HTML fragment to clean. Expected to be wrapped in paragraph tags already.</param>
/// <returns>
/// The cleaned markup, with each <c>&lt;p&gt;</c> followed by a single space.
/// A null or empty <paramref name="content"/> is returned unchanged.
/// </returns>
public static string ClearHtml(string content) {
    if (string.IsNullOrEmpty(content)) {
        return content;
    }

    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    // Drop comments (including IE conditional comments) as a unit so that markup
    // inside them never reaches the tag pass below.
    content = Regex.Replace(content, @"<!--.*?-->", "", options);

    // Single pass over every tag. Deciding on the exact tag name (instead of building
    // one regex per discovered name) means <i> can no longer delete <img>, names with
    // regex metacharacters cannot throw, and a stray "<" in text is not a tag at all.
    content = Regex.Replace(
        content,
        @"<(?<close>/?)(?<name>[a-z!?][^\s/>]*)[^>]*>",
        tag => {
            bool isClosing = tag.Groups["close"].Length > 0;
            switch (tag.Groups["name"].Value.ToLowerInvariant()) {
                case "p":
                    // Keep the tag, drop attributes and normalise case.
                    return isClosing ? "</p>" : "<p>";
                case "strong":
                    return tag.Value;
                case "br":
                    // Turned into paragraph breaks by the line-break step below.
                    return "\r\n";
                case "img": {
                    // Keep only the src attribute; the search is confined to this tag
                    // and the closing quote must match the opening one.
                    Match src = Regex.Match(tag.Value, @"\ssrc\s*=\s*(?<q>['""])(?<url>.*?)\k<q>", options);
                    return src.Success ? "<img src='" + src.Groups["url"].Value + "' />" : tag.Value;
                }
                default:
                    return string.Empty;
            }
        },
        options);

    // Every CR, LF and TAB becomes a paragraph boundary; the empty paragraphs this
    // creates (e.g. for CR+LF) are removed further down.
    content = Regex.Replace(content, @"[\r\n\t]", "</p><p>");
    content = content.Trim();

    // A boundary at the very start or end leaves half a tag pair behind; cut it off.
    if (content.StartsWith("</p>", System.StringComparison.Ordinal)) {
        content = content.Substring(4);
    }
    if (content.EndsWith("<p>", System.StringComparison.Ordinal)) {
        content = content.Remove(content.Length - 3);
    }

    // Strip whitespace at the start of each paragraph.
    content = Regex.Replace(content, @"<p>\s+", "<p>", options);

    // Collapse directly nested openings / closings. Replace() handles non-overlapping
    // matches only, so repeat until nothing is left to collapse.
    Regex nestedOpen = new Regex(@"<p>\s*?<p>", options);
    while (nestedOpen.IsMatch(content)) {
        content = nestedOpen.Replace(content, "<p>");
    }
    Regex nestedClose = new Regex(@"<\/p>\s*?<\/p>", options);
    while (nestedClose.IsMatch(content)) {
        content = nestedClose.Replace(content, @"</p>");
    }

    // Remove paragraphs that contain nothing but whitespace.
    content = Regex.Replace(content, @"<p>\s*</p>", "", options);

    // Indent the start of every paragraph.
    content = content.Replace("<p>", "<p> ");
    return content;
}
剔除了除p、img、strong之外的其他标签,对p、img的各种属性也进行了清除,专门用于生成干净的网页正文,可用于信息采集后的内容整理和格式化排版。自用代码,算法效率可能不高,但是足以满足目前需求了。
来源:https://www.cnblogs.com/theluther/p/4762435.html