[C#]网络扒虫

来源：互联网发布：linux nc命令端口编辑：程序博客网时间：2024/06/11 10:39

这个其实不能算是完全意义上的网络扒虫，只是对某个社交网络进行扒取，然后得到邻接矩阵，以及相应的头像等信息。

主要的步骤：

1，扒取信息

2，正则匹配

正则表达式主要参考了：http://deerchao.net/tutorials/regex/regex.htm

扒取信息时用的是 WebClient 类，相比 HttpWebRequest / HttpWebResponse 的写法更简洁一些。

难点是克服网站的认证机制，用的是保存Cookies的方法。

扒虫部分代码：

using System;

using System.Net;

using System.Text;

using System.Text.RegularExpressions;

/// <summary>
/// Small helper that fetches pages and avatar images over HTTP using
/// <see cref="WebClient"/>.
/// </summary>
public class Crawler
{
    /// <summary>
    /// Downloads the resource at <paramref name="url"/> and decodes it as UTF-8 text.
    /// </summary>
    /// <param name="url">Address of the page to fetch.</param>
    /// <returns>The response body decoded as UTF-8.</returns>
    /// <remarks>
    /// A fixed <c>Cookie</c> header is sent with every request. Most of the cookie
    /// values in the literal below are blank.
    /// NOTE(review): presumably these must be filled in from a logged-in session
    /// before the site will answer; confirm before use.
    /// Exceptions thrown by <see cref="WebClient.DownloadData(string)"/> propagate
    /// to the caller.
    /// </remarks>
    public static string GetCont(string url)//fetch a page
    {
        string cookies = "_r01_=; depovince=BJ; p=; ap=; t=; societyguester=55a777838c4286ab5f657382dbd25c736; id=; xnsid=";

        // WebClient is IDisposable; release it as soon as the download is done.
        using (WebClient client = new WebClient())
        {
            client.Headers.Add("Cookie", cookies);
            byte[] rawPage = client.DownloadData(url);
            return Encoding.UTF8.GetString(rawPage);
        }
    }

    /// <summary>
    /// Downloads the image at <paramref name="ImgUrl"/> and saves it as
    /// <c>D:/picture/&lt;name&gt;.jpg</c>, where <c>&lt;name&gt;</c> is the leading run of
    /// word characters in <paramref name="UserName"/>.
    /// </summary>
    /// <param name="ImgUrl">Address of the image. Null or empty is ignored.</param>
    /// <param name="UserName">User name used to derive the file name. Null is ignored.</param>
    /// <remarks>
    /// Best-effort: a failed download is reported on standard error and does not throw.
    /// </remarks>
    public static void GetImag(string ImgUrl, string UserName)//download the small avatar image
    {
        // Nothing to download or no name to save it under: skip quietly.
        if (string.IsNullOrEmpty(ImgUrl) || UserName == null)
        {
            return;
        }

        // "\w*" anchored at the first position always yields a match (possibly
        // empty), so Value is safe to read. Only the leading word characters are
        // kept for the file name.
        Match nameMatch = Regex.Match(UserName, "\\w*");
        string imageFileName = nameMatch.Value + ".jpg";
        string imageFilePath = @"D:/picture/" + imageFileName;

        try
        {
            using (WebClient client = new WebClient())
            {
                client.DownloadFile(ImgUrl, imageFilePath);
            }
        }
        catch (Exception ex)
        {
            // Keep going when one image fails, but leave a trace of what went wrong.
            Console.Error.WriteLine("GetImag: failed to download '" + ImgUrl + "' to '" + imageFilePath + "': " + ex.Message);
        }
    }
}

//正则表达式部分

using System.Text;

using System.Text.RegularExpressions;

public class MyRegex

{

/// <summary>
/// Extracts profile URLs from a page of HTML.
/// </summary>
/// <param name="PageHtml">HTML text to search.</param>
/// <returns>
/// An array of exactly 24 entries. The first entries hold the matched URLs in
/// document order; unused slots are <c>null</c>. Matches beyond the 24th are dropped.
/// </returns>
/// <remarks>
/// A URL is matched only when it is immediately followed by <c>"</c>, one
/// whitespace character and <c>title=</c>; that trailing text is not part of the
/// returned value.
/// NOTE(review): the host in the pattern reads <c>www..com</c>, which looks as if
/// the site name was removed; as written each unescaped <c>.</c> matches any single
/// character. Confirm the intended host.
/// </remarks>
public static string[] GetAddr(string PageHtml)
{
    string[] PageUrl = new string[24];

    Regex regex = new Regex("http://www..com/profile.do\\?portal=\\w*&id=\\d+(?=\"\\stitle=)");
    MatchCollection urlMatches = regex.Matches(PageHtml);

    // Stop at the array size so a page with more than 24 matches cannot overflow it.
    for (int i = 0; i < urlMatches.Count && i < PageUrl.Length; i++)
    {
        PageUrl[i] = urlMatches[i].Value;
    }

    return PageUrl;
}

/// <summary>
/// Extracts avatar image URLs from a page of HTML.
/// </summary>
/// <param name="PageHtml">HTML text to search.</param>
/// <returns>
/// An array of exactly 24 entries. The first entries hold the matched <c>src</c>
/// values in document order; unused slots are <c>null</c>. Matches beyond the 24th
/// are dropped.
/// </returns>
/// <remarks>
/// The value returned is the text between <c>stats="pf_friend" src="</c> and
/// <c>" width="50" /&gt;</c> (single whitespace characters between attributes).
/// NOTE(review): <c>.*</c> is greedy and does not cross line breaks, so two such
/// tags on the same line would be captured as one match; confirm the page layout.
/// </remarks>
public static string[] GetImgAddr(string PageHtml)
{
    string[] ImgAddr = new string[24];

    Regex regex = new Regex("(?<=stats=\"pf_friend\"\\ssrc\\=\").*(?=\"\\swidth=\"50\"\\s/>)");
    MatchCollection ImgMatches = regex.Matches(PageHtml);

    // Stop at the array size so a page with more than 24 matches cannot overflow it.
    for (int i = 0; i < ImgMatches.Count && i < ImgAddr.Length; i++)
    {
        ImgAddr[i] = ImgMatches[i].Value;
    }

    return ImgAddr;
}

/// <summary>
/// Extracts user names from a page of HTML.
/// </summary>
/// <param name="PageHtml">HTML text to search.</param>
/// <returns>
/// An array of exactly 24 entries. The first entries hold the matched names in
/// document order; unused slots are <c>null</c>. Matches beyond the 24th are dropped.
/// </returns>
/// <remarks>
/// A name is the text that follows <c>title="查看</c> and precedes
/// <c>的个人主页"&gt;</c>, one non-word character, and
/// <c>&lt;img stats="pf_friend"</c>.
/// NOTE(review): <c>.*</c> is greedy and does not cross line breaks, so two such
/// links on the same line would be captured as one match; confirm the page layout.
/// </remarks>
public static string[] GetUsName(string PageHtml)
{
    string[] UsName = new string[24];

    Regex regex = new Regex("(?<=title=\"查看).*(?=的个人主页\">\\W<img\\sstats=\"pf_friend\")");
    MatchCollection UsNameMatches = regex.Matches(PageHtml);

    // Stop at the array size so a page with more than 24 matches cannot overflow it.
    for (int i = 0; i < UsNameMatches.Count && i < UsName.Length; i++)
    {
        UsName[i] = UsNameMatches[i].Value;
    }

    return UsName;
}