Sunday, November 8, 2015
/// <summary>
/// Finds every <c>&lt;div&gt;…&lt;/div&gt;</c> element in <paramref name="html"/>, passes its
/// inner HTML through the wrapper format string, and substitutes the result back
/// into the document. Inner HTML that itself contains a matching div is processed
/// recursively first.
/// </summary>
/// <param name="html">The markup to process. May be null or empty.</param>
/// <returns>
/// The processed markup. With the current wrapper (<c>"{0}"</c>) the inner HTML is
/// emitted unchanged; null or empty input is returned as-is.
/// </returns>
private string ReplaceElements(string html)
{
    // Nothing to scan; also avoids Regex.Matches throwing on null.
    if (string.IsNullOrEmpty(html))
    {
        return html;
    }

    string patten = @"<\s*div[^>]*>(.*?)<\s*/div\s*>";
    string wrapper = @"{0}";

    // The match collection is computed against the original string instance,
    // so reassigning `html` inside the loop does not disturb the enumeration.
    foreach (Match match in Regex.Matches(html, patten))
    {
        string value = match.Value;

        // Take the inner HTML from the capture group rather than from index
        // arithmetic: the closing tag may contain whitespace ("</div >"), so its
        // length is not fixed.
        Group inner = match.Groups[1];
        string innerHtml = inner.Value;

        if (Regex.IsMatch(innerHtml, patten))
        {
            innerHtml = this.ReplaceElements(innerHtml);
        }

        string wrappedText = string.Format(wrapper, innerHtml);

        // Splice by position instead of value.Replace(innerHtml, ...):
        //  * string.Replace throws ArgumentException when the inner HTML is empty;
        //  * a textual replace would also hit occurrences of the inner text inside
        //    the opening/closing tags themselves.
        int innerStart = inner.Index - match.Index;
        string modifiedValue =
            value.Substring(0, innerStart)
            + wrappedText
            + value.Substring(innerStart + inner.Length);

        html = html.Replace(value, modifiedValue);
    }

    return html;
}
// Usage: run the markup through the div-processing pass and keep the result.
html = ReplaceElements(html);
Extract all text content from a web page
/// <summary>
/// Downloads the page at <paramref name="url"/> and reduces it to plain text by
/// removing style blocks, script blocks, images, all remaining HTML tags, and
/// redundant whitespace.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The extracted text, or a string beginning with "Error on downloading: "
/// followed by the exception message if any step fails.
/// </returns>
public string scrapeWebsite(string url)
{
    string extractedContent = "";

    try
    {
        // WebClient is IDisposable; release it as soon as the download is done.
        using (WebClient wc = new WebClient())
        {
            // "User-Agent" is the actual HTTP request header name
            // ("HTTP_USER_AGENT" is the server-side CGI variable, not a header).
            wc.Headers.Add("User-Agent", "Web-Scraper-Agent (your-custom-user-agent-here)");

            // Download the web page content from the URL
            extractedContent = wc.DownloadString(url);
        }

        RegexOptions blockOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

        // Remove CSS styles, if any found (Singleline so the block may span lines)
        extractedContent = Regex.Replace(extractedContent, @"<style[^>]*>.*?</style\s*>", "", blockOptions);

        // Remove script blocks
        extractedContent = Regex.Replace(extractedContent, @"<script[^>]*>.*?</script\s*>", "", blockOptions);

        // Remove all images
        extractedContent = Regex.Replace(extractedContent, @"<img[^>]*>", "", RegexOptions.IgnoreCase);

        // Remove all HTML tags, leaving only the text inside.
        // [^>]* also matches newlines, so tags split across lines are covered.
        extractedContent = Regex.Replace(extractedContent, @"<[^>]*>", "");

        // Remove all extra spaces, tabs and repeated line-breaks
        extractedContent = Regex.Replace(extractedContent, @"\x09", "");          // tabs
        extractedContent = Regex.Replace(extractedContent, @"\x20{2,}", " ");     // runs of spaces
        extractedContent = Regex.Replace(extractedContent, @"(\x0D\x0A)+", " ");  // CRLF runs
    }
    catch (Exception e)
    {
        // Deliberate best-effort: callers receive the failure as text, not as an exception.
        extractedContent = "Error on downloading: " + e.Message;
    }

    return extractedContent;
}
Subscribe to:
Post Comments (Atom)