哈哈,以神之名召唤尔等(正则网络爬虫)

来源:互联网 发布:linux 设置启动级别 编辑:程序博客网 时间:2024/06/02 17:56
package 我的虫子;

import java.net.*;
import java.util.regex.*;
import java.io.*;

/**
 * Regex-based scraper for the Baidu Tieba member-ranking pages.
 *
 * <p>For every ranking page it extracts the anchor texts (member names), skips the
 * leading entries of each page, and appends the names to a text file as
 * {@code " @name"} mentions, five per line.
 */
public class Chongzi {

    /** File the mentions are appended to. */
    private static final String OUTPUT_PATH = "f:\\1.txt";

    /**
     * Ranking-page URL without the page number. To target another forum, open its
     * ranking page in a browser and paste that address here.
     */
    private static final String RANK_URL = "http://tieba.baidu.com/f/like/furank?kw=java&pn=";

    /** Pages 1 (inclusive) up to this value (exclusive) are fetched. */
    private static final int LAST_PAGE_EXCLUSIVE = 1000;

    /** Tieba limits how many users one post may summon, so each line holds five names. */
    private static final int NAMES_PER_LINE = 5;

    /** The first three entries of every page are the forum moderators; they are skipped. */
    private static final int LEADING_ENTRIES_TO_SKIP = 3;

    /** Label of the "search whole forum" link, which the anchor regex also captures. */
    private static final String SEARCH_LABEL = "全吧搜索";

    /** Matches {@code ">text</a><}-shaped fragments with 3 to 20 characters of text. */
    private static final Pattern ANCHOR_TEXT =
            Pattern.compile("[\"][\\>](.){3,20}[\\<][//][a][/>][/<]");

    /** Regex for the {@code </a><">} junction between two concatenated fragments. */
    private static final String NAME_SEPARATOR = "[\\<][/][a][\\>][\\<][\"][\\>]";

    /**
     * Fetches every ranking page and appends the extracted names to {@link #OUTPUT_PATH}.
     *
     * @param args ignored
     * @throws Exception if a page cannot be fetched or the output file cannot be written
     */
    public static void main(String[] args) throws Exception {
        StringBuilder pendingLine = new StringBuilder();
        // Counted across pages so that a line never exceeds NAMES_PER_LINE.
        int namesInLine = 0;

        // try-with-resources closes the writer even when a page fetch fails.
        try (BufferedWriter out = new BufferedWriter(new FileWriter(new File(OUTPUT_PATH), true))) {
            for (int page = 1; page < LAST_PAGE_EXCLUSIVE; page++) {
                String[] entries = splitEntries(collectMatches(page));
                for (int j = LEADING_ENTRIES_TO_SKIP; j < entries.length; j++) {
                    pendingLine.append(" @").append(entries[j]);
                    namesInLine++;
                    if (namesInLine == NAMES_PER_LINE) {
                        writeLine(out, pendingLine);
                        namesInLine = 0;
                    }
                }
            }
            // Emit the final, possibly shorter, line instead of dropping it.
            if (namesInLine > 0) {
                writeLine(out, pendingLine);
            }
        }
    }

    /** Downloads one ranking page and returns all anchor fragments concatenated. */
    private static String collectMatches(int page) throws IOException {
        URL url = new URL(RANK_URL + page);
        StringBuilder collected = new StringBuilder();
        // NOTE(review): the page is decoded with the platform default charset, as in the
        // original code - confirm this matches the encoding the site actually serves.
        try (BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()))) {
            String line;
            while ((line = in.readLine()) != null) {
                Matcher matcher = ANCHOR_TEXT.matcher(line);
                while (matcher.find()) {
                    collected.append(matcher.group());
                    // The regex also picks up the search link; cut its label back out.
                    if (collected.indexOf(SEARCH_LABEL) >= 0) {
                        String trimmed = collected.substring(6, collected.length() - 1);
                        collected.setLength(0);
                        collected.append(trimmed);
                    }
                }
            }
        }
        return collected.toString();
    }

    /** Splits the concatenated fragments into entries and strips leftover markup. */
    private static String[] splitEntries(String collected) {
        String[] entries = collected.split(NAME_SEPARATOR);
        for (int i = 0; i < entries.length; i++) {
            // Leading residue left behind by the search-label removal.
            if (entries[i].contains("</a>\">")) {
                entries[i] = entries[i].substring(6);
            }
            // Trailing "</a><" of the last fragment, which no separator consumed.
            if (entries[i].contains("</a><")) {
                entries[i] = entries[i].substring(0, entries[i].length() - 5);
            }
        }
        return entries;
    }

    /** Writes the pending line followed by CRLF, flushes, and clears the buffer. */
    private static void writeLine(BufferedWriter out, StringBuilder pendingLine) throws IOException {
        out.write(pendingLine + "\r\n");
        out.flush();
        pendingLine.setLength(0);
    }
}

0 0