I am working on a simple web crawler to get a URL, Crawl first level links on the site and extract mails from all pages using RegEx...
我知道这种说法是假的,它只是开端,但我总是在发言2分钟后“及时”。
private void button1_Click(object sender, System.EventArgs e)
{
string url = textBox1.Text;
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
StreamReader sr = new StreamReader(response.GetResponseStream());
string code = sr.ReadToEnd();
string re = "href="(.*?)"";
MatchCollection href = Regex.Matches(code, @re, RegexOptions.Singleline);
foreach (Match h in href)
{
string link = h.Groups[1].Value;
if (!link.Contains("http://"))
{
HttpWebRequest request2 = (HttpWebRequest)WebRequest.Create(url + link);
HttpWebResponse response2 = (HttpWebResponse)request2.GetResponse();
StreamReader sr2 = new StreamReader(response.GetResponseStream());
string innerlink = sr.ReadToEnd();
MatchCollection m2 = Regex.Matches(code, @"([w-]+(.[w-]+)*@([a-z0-9-]+(.[a-z0-9-]+)*?.[a-z]{2,6}|(d{1,3}.){3}d{1,3})(:d{4})?)", RegexOptions.Singleline);
foreach (Match m in m2)
{
string email = m.Groups[1].Value;
if (!listBox1.Items.Contains(email))
{
listBox1.Items.Add(email);
}
}
}
}
sr.Close();
}