Ich benutze diese Funktion schon seit einer Weile. Sie entfernt so ziemlich jedes unsaubere HTML, das man ihr vorsetzt, und lässt den Text intakt.
// Matches a single markup tag: '<', one or more characters other than '>', then '>'.
private static readonly Regex _tags_ = new Regex(@"<[^>]+?>", RegexOptions.Multiline | RegexOptions.Compiled);
// Matches every character that is NOT on the allow-list (word characters, space and ; & # @ . : / \ ? = | % ! ( ) -).
// Add characters that should not be removed to this regex.
private static readonly Regex _notOkCharacter_ = new Regex(@"[^\w;&#@.:/\\?=|%!() -]", RegexOptions.Compiled);
/// <summary>
/// Strips markup from an HTML fragment and returns the remaining text with
/// runs of whitespace collapsed to single spaces.
/// </summary>
/// <param name="html">The HTML to clean; may be URL- and/or HTML-encoded. May be null.</param>
/// <returns>The cleaned text, or an empty string when <paramref name="html"/> is null or empty.</returns>
public static String UnHtml(String html)
{
    // Guard: without it a null input would reach html.IndexOf in RemoveTag and throw.
    if (String.IsNullOrEmpty(html))
    {
        return String.Empty;
    }

    // Decode before stripping, so markup that arrives encoded (e.g. &lt;b&gt;)
    // becomes real tags and is removed by the steps below.
    html = HttpUtility.UrlDecode(html);
    html = HttpUtility.HtmlDecode(html);

    // Remove comments, scripts and styles together with everything inside them.
    html = RemoveTag(html, "<!--", "-->");
    html = RemoveTag(html, "<script", "</script>");
    html = RemoveTag(html, "<style", "</style>");

    // Replace matches of these regexes with a space (not an empty string) so that
    // words separated only by a tag or a disallowed character do not run together.
    html = _tags_.Replace(html, " ");
    html = _notOkCharacter_.Replace(html, " ");

    return SingleSpacedTrim(html);
}
/// <summary>
/// Removes every region that starts with <paramref name="startTag"/> and ends with
/// the next following <paramref name="endTag"/>, including both delimiters.
/// </summary>
/// <param name="html">Text to process; returned unchanged when null or empty.</param>
/// <param name="startTag">Opening delimiter, matched case-insensitively (e.g. "&lt;script").</param>
/// <param name="endTag">Closing delimiter, matched case-insensitively (e.g. "&lt;/script&gt;").</param>
/// <returns>
/// The text with all delimited regions removed. A start delimiter that has no
/// matching end delimiter after it is left in place.
/// </returns>
private static String RemoveTag(String html, String startTag, String endTag)
{
    if (String.IsNullOrEmpty(html))
    {
        return html;
    }

    // Each pass searches from the beginning again, so a delimiter that only
    // comes into existence once a region has been cut out is found as well.
    while (true)
    {
        // Ordinal comparison: tag names are not linguistic text, and culture-aware
        // case folding (e.g. Turkish dotted/dotless i) would miss "<SCRIPT".
        Int32 startTagPos = html.IndexOf(startTag, 0, StringComparison.OrdinalIgnoreCase);
        if (startTagPos < 0)
        {
            break;
        }

        Int32 endTagPos = html.IndexOf(endTag, startTagPos + 1, StringComparison.OrdinalIgnoreCase);
        if (endTagPos <= startTagPos)
        {
            // No closing delimiter after the opening one: nothing more to remove.
            break;
        }

        html = html.Remove(startTagPos, endTagPos - startTagPos + endTag.Length);
    }

    return html;
}
/// <summary>
/// Collapses every run of blanks (space, tab, carriage return, line feed) into a
/// single space and trims the result.
/// </summary>
/// <param name="inString">Text to normalize. May be null.</param>
/// <returns>The normalized text, or an empty string when <paramref name="inString"/> is null or empty.</returns>
private static String SingleSpacedTrim(String inString)
{
    if (String.IsNullOrEmpty(inString))
    {
        return String.Empty;
    }

    // The result can never be longer than the input, so presize the builder.
    StringBuilder sb = new StringBuilder(inString.Length);
    Boolean inBlanks = false;
    foreach (Char c in inString)
    {
        switch (c)
        {
            case '\r':
            case '\n':
            case '\t':
            case ' ':
                // Emit one space for the first blank of a run and skip the rest.
                if (!inBlanks)
                {
                    inBlanks = true;
                    sb.Append(' ');
                }
                break;
            default:
                inBlanks = false;
                sb.Append(c);
                break;
        }
    }

    // Trim removes the single leading/trailing space left by blanks at either end.
    return sb.ToString().Trim();
}