The first regular expression finds the images in the HTML and replaces their src URLs:
/// <summary>
/// Rewrites image URLs in an HTML fragment: for every &lt;img&gt; tag found, the
/// part of its src value equal to <paramref name="srcPath"/> (compared
/// case-insensitively) is replaced with <paramref name="newPath"/>, and every
/// occurrence of the old src value in the document is updated accordingly.
/// The tag, its height/width and its src are also written to the console.
/// </summary>
/// <param name="HTMLBody">HTML to process. Returned unchanged when null or empty.</param>
/// <param name="srcPath">URL prefix/fragment to look for. When null or empty nothing is replaced.</param>
/// <param name="newPath">Replacement text; null is treated as an empty string.</param>
/// <returns>The HTML with the matching image URLs rewritten; all other text keeps its original casing.</returns>
static string ReplaceTag(string HTMLBody, string srcPath, string newPath)
{
    // Nothing to search in / nothing to search for.
    if (string.IsNullOrEmpty(HTMLBody) || string.IsNullOrEmpty(srcPath))
    {
        return HTMLBody;
    }

    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.CultureInvariant | RegexOptions.Singleline;

    // Attribute values may be single-quoted, double-quoted or unquoted. The
    // look-behind keeps "src" from matching inside names such as "data-src".
    Regex reImg = new Regex(@"<img\b[^>]*>", options);
    Regex reHeight = new Regex(@"(?<![\w-])height\s*=\s*(?:(['""])(?<height>(?:(?!\1).)*)\1|(?<height>[^\s>]+))", options);
    Regex reWidth = new Regex(@"(?<![\w-])width\s*=\s*(?:(['""])(?<width>(?:(?!\1).)*)\1|(?<width>[^\s>]+))", options);
    Regex reSrc = new Regex(@"(?<![\w-])src\s*=\s*(?:(['""])(?<src>(?:(?!\1).)*)\1|(?<src>[^\s>]+))", options);

    // '$' is special in regex replacement strings, so it must be doubled.
    string newPathReplacement = (newPath ?? string.Empty).Replace("$", "$$");
    string srcPathPattern = Regex.Escape(srcPath);

    string tmpHTMLBody = HTMLBody;

    foreach (Match mImg in reImg.Matches(HTMLBody))
    {
        string imgTag = mImg.Value;
        Console.WriteLine(" img tag: {0}", imgTag);

        Match mHeight = reHeight.Match(imgTag);
        if (mHeight.Success)
        {
            Console.WriteLine(" height is: {0}", mHeight.Groups["height"].Value);
        }

        Match mWidth = reWidth.Match(imgTag);
        if (mWidth.Success)
        {
            Console.WriteLine(" width is: {0}", mWidth.Groups["width"].Value);
        }

        // The rewrite depends on the src attribute only (not on height).
        Match mSrc = reSrc.Match(imgTag);
        if (!mSrc.Success)
        {
            continue;
        }

        string oldImgSrc = mSrc.Groups["src"].Value;
        Console.WriteLine(" src is: {0}", oldImgSrc);
        if (oldImgSrc.Length == 0)
        {
            continue;
        }

        // Case-insensitive replacement without lower-casing the document.
        string newImgSrc = Regex.Replace(oldImgSrc, srcPathPattern, newPathReplacement, options);
        if (!string.Equals(newImgSrc, oldImgSrc, StringComparison.Ordinal))
        {
            tmpHTMLBody = Regex.Replace(
                tmpHTMLBody,
                Regex.Escape(oldImgSrc),
                newImgSrc.Replace("$", "$$"),
                options);
        }
    }

    return tmpHTMLBody;
}
This regular expression cleans Word-generated HTML by removing Word-specific tags, classes, and attributes:
/// <summary>
/// Cleans HTML produced by Microsoft Word: removes a fixed set of unwanted
/// tags (font, h1-h6, b, span, xml, del, ins and any o:/v:/w:/x:/p: namespaced
/// tag) while keeping their inner text, then removes the class, lang, style,
/// size, face and o:/v:/w:/x:/p: namespaced attributes from the remaining tags.
/// </summary>
/// <param name="html">The HTML to clean.</param>
/// <returns>The cleaned HTML, or an empty string when <paramref name="html"/> is null or empty.</returns>
static internal string CleanWordHtml(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // The \b after the tag name prevents "b" from also matching <br>, <body>, ...
    const string unwantedTags =
        @"<[/]?(font|h1|h2|h3|h4|h5|h6|b|span|xml|del|ins|[ovwxp]:\w+)\b[^>]*?>";
    const string unwantedAttributes =
        @"<([^>]*)(?:class|lang|style|size|face|[ovwxp]:\w+)=(?:'[^']*'|""[^""]*""|[^\s>]+)([^>]*)>";

    // start by completely removing all unwanted tags
    html = Regex.Replace(html, unwantedTags, "", RegexOptions.IgnoreCase);

    // Each pass strips at most one attribute per tag, so repeat until nothing
    // changes. Every replacement shortens the string, so the loop terminates.
    string previous;
    do
    {
        previous = html;
        html = Regex.Replace(html, unwantedAttributes, "<$1$2>", RegexOptions.IgnoreCase);
    }
    while (!string.Equals(html, previous, StringComparison.Ordinal));

    return html;
}
This regular expression removes tags from HTML:
/// <summary>
/// Strips HTML tags from a string.
/// </summary>
/// <param name="html">The HTML to process.</param>
/// <param name="allowHarmlessTags">
/// When true, only script, embed, object, frameset, frame, iframe, meta, link
/// and style tags are removed (the text between an opening and closing tag is
/// kept) and the result is not HTML-decoded. When false, every tag is removed
/// and the remaining text is HTML-decoded.
/// </param>
/// <returns>The stripped text, or an empty string when <paramref name="html"/> is null or empty.</returns>
static internal string StripHtml(string html, bool allowHarmlessTags)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    if (allowHarmlessTags)
    {
        // NOTE(review): this only drops the listed tags themselves; it is not a
        // full HTML sanitizer (e.g. inline event-handler attributes are untouched).
        return System.Text.RegularExpressions.Regex.Replace(
            html,
            @"</?(?:script|embed|object|frameset|frame|iframe|meta|link|style)\b[^>]*>",
            string.Empty,
            System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    }

    string strippedHtml = System.Text.RegularExpressions.Regex.Replace(html, "<[^>]*>", string.Empty);
    strippedHtml = HttpUtility.HtmlDecode(strippedHtml);
    return strippedHtml;
}
Check whether a string contains Arabic characters using C#:
/// <summary>
/// Tells whether the given text contains at least one character matched by the
/// <c>\p{IsArabic}</c> regular-expression class.
/// </summary>
/// <param name="text">The text to inspect.</param>
/// <returns>True when a matching character is found; otherwise false.</returns>
static internal bool hasArabic(string text)
{
    return Regex.IsMatch(text, "\\p{IsArabic}");
}
No comments:
Post a Comment