Sunday, November 8, 2015

/// <summary>
/// Wraps the inner content of every <c>&lt;div&gt;</c> element found in
/// <paramref name="html"/> with the wrapper template, leaving the opening and
/// closing tags themselves untouched.
/// </summary>
/// <param name="html">The markup to process. May be null or empty.</param>
/// <returns>
/// The markup with each div's content passed through the wrapper template;
/// null or empty input is returned unchanged.
/// </returns>
/// <remarks>
/// The pattern is lazy, so each opening tag is paired with the nearest closing
/// tag; nested divs are therefore not balanced. Because <c>.</c> does not match
/// a line break by default, a div whose content spans several lines is skipped.
/// </remarks>
private string ReplaceElements(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return html;
    }

    // Group 1 = opening tag, group 2 = inner content, group 3 = closing tag.
    // Capturing the tags separately avoids assuming the closing tag is exactly
    // six characters long (the pattern allows whitespace inside it).
    string pattern = @"(<\s*div[^>]*>)(.*?)(<\s*/div\s*>)";

    // NOTE(review): the template only surrounds the content with line breaks;
    // presumably the original wrapper markup was lost when the snippet was
    // published - confirm and restore it here if so.
    string wrapper = @"
{0}
";

    // Rebuild every match positionally instead of using string.Replace on the
    // inner text: that approach throws for an empty div and mangles the tags
    // when the inner text also occurs inside them (e.g. "<div>d</div>").
    return Regex.Replace(
        html,
        pattern,
        match => match.Groups[1].Value
                 + string.Format(wrapper, match.Groups[2].Value)
                 + match.Groups[3].Value);
}
Usage:
 html = this.ReplaceElements(html); 
 
------------------------------------

Extract all text content from a web page

/// <summary>
/// Downloads the page at <paramref name="url"/> and reduces it to plain text by
/// removing style blocks, script blocks, images and all remaining HTML tags,
/// then collapsing tabs, repeated spaces and line breaks.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The extracted text, or a string starting with "Error on downloading: "
/// followed by the exception message if the download or clean-up fails.
/// </returns>
public string scrapeWebsite(string url)
{
    string extractedContent = "";

    // WebClient is IDisposable; release it as soon as the download is done.
    using (WebClient wc = new WebClient())
    {
        // "User-Agent" is the actual HTTP header name; "HTTP_USER_AGENT" is only
        // the CGI/server-side variable name and is not sent as the user agent.
        wc.Headers.Add("User-Agent", "Web-Scraper-Agent (your-custom-user-agent-here)");
        try
        {
            // Download the web page content from the URL
            extractedContent = wc.DownloadString(url);

            // Style and script blocks usually span several lines, so "." must
            // also match line breaks (Singleline); tag names are case-insensitive.
            RegexOptions blockOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

            // Remove CSS styles, if any found
            extractedContent = Regex.Replace(extractedContent, @"<style\b[^>]*>.*?</style\s*>", "", blockOptions);
            // Remove script blocks
            extractedContent = Regex.Replace(extractedContent, @"<script\b[^>]*>.*?</script\s*>", "", blockOptions);
            // Remove all images
            extractedContent = Regex.Replace(extractedContent, @"<img\b[^>]*>", "", RegexOptions.IgnoreCase);
            // Remove all HTML tags, leaving only the text inside.
            // [^>] also matches line breaks, so multi-line tags are removed too.
            extractedContent = Regex.Replace(extractedContent, @"<[^>]*>", "");
            // Remove all extra spaces, tabs and repeated line-breaks
            extractedContent = Regex.Replace(extractedContent, @"\t", "");
            extractedContent = Regex.Replace(extractedContent, @" {2,}", " ");
            extractedContent = Regex.Replace(extractedContent, @"(\r?\n)+", " ");
        }
        catch (Exception e)
        {
            // Callers receive the failure as text rather than as an exception.
            extractedContent = "Error on downloading: " + e.Message;
        }
    }

    return extractedContent;
}
 

0 Comments:

Post a Comment