Thursday, April 28, 2011

Regular expressions

Sometimes I need to accomplish a task using regular expressions. In this post I have collected the regular expressions I use:

The first regular expression finds images in the HTML and replaces their src URLs:







/// <summary>
/// Rewrites the <c>src</c> attribute of every <c>&lt;img&gt;</c> tag in an HTML fragment,
/// replacing each occurrence of <paramref name="srcPath"/> (matched case-insensitively)
/// with <paramref name="newPath"/>. For each image, the tag and its height, width and src
/// values are also written to the console as diagnostics.
/// </summary>
/// <param name="HTMLBody">HTML to process. Null or empty yields an empty string.</param>
/// <param name="srcPath">Path fragment to look for inside each image src. When null or empty, no src is rewritten.</param>
/// <param name="newPath">Replacement for <paramref name="srcPath"/>; null is treated as empty.</param>
/// <returns>
/// The HTML with image sources rewritten. Everything outside the rewritten src values
/// (text, other tags, letter casing) is returned unchanged.
/// </returns>
static string ReplaceTag(string HTMLBody, string srcPath, string newPath)
{
    if (string.IsNullOrEmpty(HTMLBody))
    {
        return string.Empty;
    }

    const RegexOptions attrOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    Regex reImg = new Regex(@"<img[^>]*>", RegexOptions.IgnoreCase);

    // Each attribute pattern accepts a quoted value (single or double quotes, closed by
    // the same quote via the \1 backreference) or an unquoted value that runs up to the
    // next whitespace or '>'. Both alternatives capture into the same named group.
    Regex reHeight = new Regex(@"height=(?:(['""])(?<height>(?:(?!\1).)*)\1|(?<height>[^\s>]+))", attrOptions);
    Regex reWidth = new Regex(@"width=(?:(['""])(?<width>(?:(?!\1).)*)\1|(?<width>[^\s>]+))", attrOptions);
    Regex reSrc = new Regex(@"src=(?:(['""])(?<src>(?:(?!\1).)*)\1|(?<src>[^\s>]+))", attrOptions);

    // Escape srcPath so characters such as '.' are matched literally, not as regex syntax.
    Regex rePath = string.IsNullOrEmpty(srcPath)
        ? null
        : new Regex(Regex.Escape(srcPath), RegexOptions.IgnoreCase);
    string replacement = newPath ?? string.Empty;

    return reImg.Replace(HTMLBody, mImg =>
    {
        string imgTag = mImg.Value;
        Console.WriteLine(" img tag: {0}", imgTag);

        Match mHeight = reHeight.Match(imgTag);
        if (mHeight.Success)
        {
            Console.WriteLine(" height is: {0}", mHeight.Groups["height"].Value);
        }

        Match mWidth = reWidth.Match(imgTag);
        if (mWidth.Success)
        {
            Console.WriteLine(" width is: {0}", mWidth.Groups["width"].Value);
        }

        // Decide on the src match itself (not on the height match), so images without
        // a height attribute are still rewritten and images without a src are skipped.
        Match mSrc = reSrc.Match(imgTag);
        if (!mSrc.Success)
        {
            return imgTag;
        }

        Group srcGroup = mSrc.Groups["src"];
        Console.WriteLine(" src is: {0}", srcGroup.Value);

        if (rePath == null || srcGroup.Length == 0)
        {
            return imgTag;
        }

        // A MatchEvaluator is used so '$' in newPath is inserted literally instead of
        // being interpreted as a substitution token.
        string newSrc = rePath.Replace(srcGroup.Value, m => replacement);

        // Splice the new value into the tag at the exact position of the old one, so
        // only this attribute changes and the rest of the document keeps its casing.
        return imgTag.Substring(0, srcGroup.Index)
            + newSrc
            + imgTag.Substring(srcGroup.Index + srcGroup.Length);
    });
}

This regular expression cleans HTML generated by Word, removing Word-specific tags, classes and attributes:







/// <summary>
/// Cleans HTML produced by Microsoft Word: removes a fixed set of presentational and
/// Office-namespaced tags, then removes class/lang/style/size/face and Office-namespaced
/// attributes from the remaining tags.
/// </summary>
/// <param name="html">HTML to clean. Null or empty yields an empty string.</param>
/// <returns>
/// The HTML with the unwanted tags and attributes removed. The text between removed
/// tags is kept. Whitespace that surrounded a removed attribute is left in the tag.
/// </returns>
static internal string CleanWordHtml(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // Start by completely removing all unwanted tags (opening and closing forms).
    // The \b after the tag name stops short names from matching longer ones,
    // e.g. "b" must not strip <br> or <body>.
    html = Regex.Replace(
        html,
        @"<[/]?(font|h1|h2|h3|h4|h5|h6|b|span|xml|del|ins|[ovwxp]:\w+)\b[^>]*?>",
        "",
        RegexOptions.IgnoreCase);

    // Then remove unwanted attributes. The leading group is greedy, so each pass strips
    // only the last unwanted attribute of every tag; repeat until a pass changes nothing.
    // Every successful replacement shortens the string, so the loop terminates.
    string previous;
    do
    {
        previous = html;
        html = Regex.Replace(
            html,
            @"<([^>]*)(?:class|lang|style|size|face|[ovwxp]:\w+)=(?:'[^']*'|""[^""]*""|[^\s>]+)([^>]*)>",
            "<$1$2>",
            RegexOptions.IgnoreCase);
    }
    while (html != previous);

    return html;
}

This regular expression removes tags from HTML:







/// <summary>
/// Removes HTML tags from a string.
/// </summary>
/// <param name="html">HTML to strip. Null or empty yields an empty string.</param>
/// <param name="allowHarmlessTags">
/// When true, only the tags script, embed, object, frameset, frame, iframe, meta, link
/// and style are removed and the result is not HTML-decoded. When false, every tag is
/// removed and the remaining text is HTML-decoded.
/// </param>
/// <returns>The stripped text.</returns>
static internal string StripHtml(string html, bool allowHarmlessTags)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    if (allowHarmlessTags)
    {
        // NOTE(review): the pattern in the published listing was empty, which strips
        // nothing. This tag list is reconstructed from the commonly published version
        // of this helper; confirm it matches the intended set.
        // Only the tags themselves are removed, not the text between them, and event
        // handler attributes are untouched, so this is not a complete sanitizer for
        // untrusted input.
        return System.Text.RegularExpressions.Regex.Replace(
            html,
            @"</?(?:script|embed|object|frameset|frame|iframe|meta|link|style)\b[^>]*>",
            string.Empty,
            System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    }

    string strippedHtml = System.Text.RegularExpressions.Regex.Replace(html, "<[^>]*>", string.Empty);

    // Decode entities such as &amp; only after the tags are gone.
    strippedHtml = HttpUtility.HtmlDecode(strippedHtml);
    return strippedHtml;
}


Check whether a string contains Arabic characters using C#:





/// <summary>
/// Reports whether the given text contains at least one character matched by the
/// <c>\p{IsArabic}</c> regular-expression block.
/// </summary>
/// <param name="text">Text to inspect.</param>
/// <returns>True when a matching character is found; otherwise false.</returns>
static internal bool hasArabic(string text)
{
    const string arabicBlockPattern = "\\p{IsArabic}";
    bool containsArabic = Regex.IsMatch(text, arabicBlockPattern);
    return containsArabic;
}

No comments: