网络蜘蛛--抓取一个网页的邮箱

来源:互联网 发布:sql limit 数据库优化 编辑:程序博客网 时间:2024/06/02 23:04

技术:正则表达式+网络编程(URL)

package cn.hncu.br;import java.io.BufferedReader;import java.io.FileNotFoundException;import java.io.FileReader;import java.io.IOException;import java.io.InputStreamReader;import java.net.MalformedURLException;import java.net.URL;import java.util.ArrayList;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.junit.Test;public class SpiderDemo {    @Test    public void Ahelf(){        Pattern p2=Pattern.compile("\\w+@\\w+(\\.\\w)+");        System.out.println("http://sina.com.cn".matches("[a-zA-Z]+://(\\w+)(.\\w+)+(/\\w[^ ])*"));    }    @Test    public void analily(){        String regex="([\\w-\\.]+)@((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.)|(([a-zA-Z0-9\\-]+\\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\\]?)";        ///^([a-zA-Z0-9_-])+@([a-zA-Z0-9_-])+(.[a-zA-Z0-9_-])+        //\\w+@\\w+(\\.\\w+)+        Pattern p=Pattern.compile(regex);        try {            BufferedReader br=new BufferedReader(new FileReader(".\\net\\mail.txt"));            String str=null;            StringBuffer sb=new StringBuffer();//用这个类加载全部可以全部搜索            while((str=br.readLine())!=null){                sb.append(str);//考虑到换行的情况            }            String result=sb.toString();            Matcher m =p.matcher(result);            while(m.find()){                System.out.println(m.group());            }        } catch (FileNotFoundException e) {            e.printStackTrace();        } catch (IOException e) {            e.printStackTrace();        }    }    public static void main(String[] args) {//        getCurrentHTMLMail();        try {            URL url=new URL("http://www.sina.com");            getAllHTMLMail(url);        } catch (MalformedURLException e) {            e.printStackTrace();        }    }    public static void getAllHTMLMail(URL url) {            ArrayList<URL> urls=new ArrayList<URL>();            ArrayList<String> mails=new ArrayList<String>();            urls.add(url);            for(int i=0;i<urls.size();i++){                URL u=urls.get(i);                getCurrentHTMLMail(u,mails);                getHTMLaHref(u, urls);                System.out.println(mails.size());                System.out.println(mails);            }            for(String str:mails){                System.out.println(str);            }    }    public static ArrayList<URL> getHTMLaHref(URL url,ArrayList<URL> urls){        try {            BufferedReader br = new BufferedReader(new InputStreamReader(                    url.openStream()));            StringBuilder sb = new StringBuilder();//用这个类加载全部可以全部搜索            String str = null;            while ((str = br.readLine()) != null) {                sb.append(str);            }            Pattern p = Pattern.compile("(http|ftp|https)://(\\w+)(.\\w+)+(/\\w[^ ])*");            //Pattern p = Pattern.compile("<a href=\"*.html\">page</a>");            //<a href="在这里插入URL"></a>            Matcher m=p.matcher(sb);            while (m.find()) {//                System.out.println(m.group());                URL u=new URL(m.group());                urls.add(u);            }        } catch (Exception e) {        }        return urls;    }    public static ArrayList<String> getCurrentHTMLMail(URL url,ArrayList<String> mail) {        try {//            URL url=new URL("http://www.sina.com.cn");            //System.out.println(url.toString());            BufferedReader br=new BufferedReader(new InputStreamReader(url.openStream()));            StringBuilder sb=new StringBuilder();//用这个类加载全部可以全部搜索            String str=null;            while((str=br.readLine())!=null){                sb.append(str);            }            Pattern p=Pattern.compile("\\w+@\\w+(.\\w)+");            Matcher m=p.matcher(sb);            while(m.find()){//                System.out.println(m.group());                mail.add(m.group());            }        } catch (IOException e) {            e.printStackTrace();  //HTTP 403命令是禁止恶意访问此网站,不能从此网站中抓取内容        }        return mail;    }    @Test    public void getMail()  {            try {                URL url = new URL("http://127.0.0.1/");                BufferedReader br = null;                try {                    br = new BufferedReader(new InputStreamReader(                            url.openStream()));                } catch (IOException e) {                }                StringBuilder sb = new StringBuilder();//用这个类加载全部可以全部搜索                String str = null;                while ((str = br.readLine()) != null) {                    sb.append(str);                }                System.out.println(sb.toString());                Pattern p = Pattern.compile("\\w+@\\w+(.\\w)+");                Matcher m = p.matcher(sb);                while (m.find()) {                    System.out.println(m.group());                }            } catch (Exception e) {            }    }}
0 0