java-抓取指定URL网页的内容

来源:互联网 发布:淘宝司法拍卖靠谱吗 编辑:程序博客网 时间:2024/06/11 16:42

    由于做的工程实践关于爬虫的,本来打算用Python写,但是发现没有Python写爬虫的书籍,但网上有一些博客,文章之类,看着不够系统,完全找不到感觉,索性学java写爬虫吧,毕竟有本书专门讲解的,下面是我照抄书上的源代码,加上部分自己写的代码.

import java.io.*;import java.net.*;import org.apache.commons.httpclient.HttpClient;import org.apache.commons.httpclient.methods.GetMethod;public class UrlMethod{public static void  main(String arg[])throws Exception{HttpClient httpClient = new HttpClient();//创建一个客户端,相当于打开一个浏览器//创建一个get方法,类似于在浏览器地址栏输入一个地址GetMethod getMethod = new GetMethod("http://www.baidu.com");//回车,获得响应状态码int status = httpClient.executeMethod(getMethod);System.out.println("response = " + getMethod.getResponseBodyAsString());System.out.println("getMethod's name" + getMethod.getName());System.out.println("getMethodBase:" + getMethod.getPath());System.out.println("statusText:" + getMethod.getStatusText());System.out.println("getMethod to string:" + getMethod.toString());getMethod.releaseConnection();//释放链接}}

output:

response = <!doctype html><html><head><meta http-equiv="Content-Type" content="text/html;charset=gb2312"><title>百度一下,你就知道      </title><style>html,body{height:100%;}html{overflow-y:auto}#wrapper{position:relative;_position:;min-height:100%}#content{padding-bottom:100px;text-align:center;}#ftCon{height:100px;position:absolute;bottom:44px;text-align:center;width:100%;margin:0 auto;z-index:0;overflow:hidden;}#ftConw{width:720px;margin:0 auto;}body{font:12px arial;text-align:;background:#fff}body,p,form,ul,li{margin:0;padding:0;list-style:none}body,form,#fm{position:relative}td{text-align:left}img{border:0}a{color:#00c}a:active{color:#f60}#u{color:#999;padding:4px 10px 5px 0;text-align:right}#u a{margin:0 5px}#u .reg{margin:0}#m{width:720px;margin:0 auto;}#nv a,#nv b,.btn,#lk{font-size:14px}#fm{padding-left:110px;text-align:left;z-index:1;}input{border:0;padding:0}#nv{height:19px;font-size:16px;margin:0 0 4px;text-align:left;text-indent:137px;}.s_ipt_wr{width:418px;height:30px;display:inline-block;margin-right:5px;background:url(http://s1.bdstatic.com/r/www/img/i-1.0.0.png) no-repeat -304px 0;border:1px solid #b6b6b6;border-color:#9a9a9a #cdcdcd #cdcdcd #9a9a9a;vertical-align:top}.s_ipt{width:405px;height:22px;font:16px/22px arial;margin:5px 0 0 7px;background:#fff;outline:none;-webkit-appearance:none}.s_btn{width:95px;height:32px;padding-top:2px\9;font-size:14px;background:#ddd url(http://s1.bdstatic.com/r/www/img/i-1.0.0.png);cursor:pointer}.s_btn_h{background-position:-100px 0}.s_btn_wr{width:97px;height:34px;display:inline-block;background:url(http://s1.bdstatic.com/r/www/img/i-1.0.0.png) no-repeat -202px 0;*position:relative;z-index:0;vertical-align:top}#lg img{vertical-align:top;margin-bottom:3px}#lk{margin:33px 0}#lk span{font:14px "宋体"}#lm{height:60px}#lh{margin:16px 0 5px;word-spacing:3px}.tools{position:absolute;top:-4px;*top:10px;right:7px;}#mHolder{width:62px;position:relative;z-index:296;display:none}#mCon{height:18px;line-height:18px;position:absolute;cursor:pointer;padding:0 18px 0 0;background:url(http://s1.bdstatic.com/r/www/img/bg-1.0.0.gif) no-repeat right -134px;background-position:right -136px\9}#mCon span{color:#00c;cursor:default;display:block}#mCon .hw{text-decoration:underline;cursor:pointer}#mMenu a{width:100%;height:100%;display:block;line-height:22px;text-indent:6px;text-decoration:none;filter:none\9}#mMenu,#user ul{box-shadow:1px 1px 2px #ccc;-moz-box-shadow:1px 1px 2px #ccc;-webkit-box-shadow:1px 1px 2px #ccc;filter: progid:DXImageTransform.Microsoft.Shadow(Strength=2, Direction=135, Color="#cccccc")\9;}#mMenu{width:56px;border:1px solid #9b9b9b;list-style:none;position:absolute;right:27px;top:28px;display:none;background:#fff}#mMenu a:hover{background:#ebebeb}#mMenu .ln{height:1px;background:#ebebeb;overflow:hidden;font-size:1px;line-height:1px;margin-top:-1px}#cp,#cp a{color:#666666;}#seth{display:none;behavior:url(#default#homepage)}#setf{display:none;}#sekj{margin-left:14px;}</style>
<script type="text/javascript">function h(obj){obj.style.behavior='url(#default#homepage)';var a = obj.setHomePage('http://www.baidu.com/');}</script></head>

<body><div id="wrapper"><div id="content">

<div id="ie6tipcon"></div>


<div id="u"><a href="http://www.baidu.com/gaoji/preferences.html" name="tj_setting">搜索设置</a>|<a href="https://passport.baidu.com/v2/?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2F" name="tj_login" id="lb" onclick="return false;">登录</a><a href="https://passport.baidu.com/v2/?reg&regType=1&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2F" target="_blank" name="tj_reg" class="reg">注册</a></div>
<div id="m"><p id="lg"><img src="http://www.baidu.com/img/baidu_sylogo1.gif" width="270" height="129" ></p>
<p id="nv"><a href="http://news.baidu.com">新&nbsp;闻</a> <b>网&nbsp;页</b> <a href="http://tieba.baidu.com">贴&nbsp;吧</a> <a href="http://zhidao.baidu.com">知&nbsp;道</a> <a href="http://music.baidu.com">音&nbsp;乐</a> <a href="http://image.baidu.com">图&nbsp;片</a> <a href="http://video.baidu.com">视&nbsp;频</a> <a href="http://map.baidu.com">地&nbsp;图</a></p><div id="fm"><form name="f" action="/s"><span class="s_ipt_wr"><input type="text" name="wd" id="kw" maxlength="100" class="s_ipt"></span><input type="hidden" name="rsv_bp" value="0"><input type="hidden" name="rsv_spt" value="3"><span class="s_btn_wr"><input type="submit" value="百度一下" id="su" class="s_btn" onmousedown="this.className='s_btn s_btn_h'" onmouseout="this.className='s_btn'"></span></form><span class="tools"><span id="mHolder"><div id="mCon"><span>输入法</span></div></span></span><ul id="mMenu"><li><a href="#" name="ime_hw">手写</a></li><li><a href="#" name="ime_py">拼音</a></li><li class="ln"></li><li><a href="#" name="ime_cl">关闭</a></li></ul></div>
<p id="lk"><a href="http://baike.baidu.com">百科</a> <a href="http://wenku.baidu.com">文库</a> <a href="http://www.hao123.com">hao123</a><span> | <a href="http://www.baidu.com/more/">更多&gt;&gt;</a></span></p><p id="lm"></p>
</div></div><div id="ftCon"><div id="ftConw"><p ><a id="seth" onClick="h(this)" href="/" onmousedown="return ns_c({'fm':'behs','tab':'homepage','pos':0})">把百度设为主页</a><a id="setf" href="http://www.baidu.com/cache/sethelp/index.html" onmousedown="return ns_c({'fm':'behs','tab':'favorites','pos':0})" target="_blank">把百度设为主页</a><span id="sekj"><a href="http://www.baidu.com/search/baidukuaijie_mp.html" target="_blank" onmousedown="return ns_c({'fm':'behs','tab':'kuaijie','pos':1})">把百度添加到桌面</a></span></p><p id="lh"><a href="http://e.baidu.com/?refer=888" onmousedown="return ns_c({'fm':'behs','tab':'btlink','pos':2})">加入百度推广</a> | <a href="http://top.baidu.com">搜索风云榜</a> | <a href="http://home.baidu.com">关于百度</a> | <a href="http://ir.baidu.com">About Baidu</a></p><p id="cp">&copy;2012 Baidu <a href="/duty/">使用百度前必读</a> <a href="http://www.miibeian.gov.cn" target="_blank">京ICP证030173号</a> <img src="http://www.baidu.com/cache/global/img/gs.gif"></p></div></div>
</div></body>

<script>var bds={se:{},comm : {ishome : 1,sid : "",user : "",username : "",sugHost : "http://suggestion.baidu.com/su",loginAction : []}}</script><script type="text/javascript" src="http://s1.bdstatic.com/r/www/cache/global/js/home-1.8.js"></script><script>var bdUser = null;var w=window,d=document,n=navigator,k=d.f.wd,a=d.getElementById("nv").getElementsByTagName("a"),isIE=n.userAgent.indexOf("MSIE")!=-1&&!window.opera;(function(){if(/q=([^&]+)/.test(location.search)){k.value=decodeURIComponent(RegExp["\x241"])}})();if(n.cookieEnabled&&!/sug?=0/.test(d.cookie)){bds.se.sug();};function addEV(o, e, f){if(w.attachEvent){o.attachEvent("on" + e, f);}else if(w.addEventListener){ o.addEventListener(e, f, false);}}function G(id){return d.getElementById(id);}function ns_c(q){var p = encodeURIComponent(window.document.location.href), sQ = '', sV = '', mu='', img = window["BD_PS_C" + (new Date()).getTime()] = new Image();for (v in q) {sV = q[v];sQ += v + "=" + sV + "&";} mu= "&mu=" + p ;img.src = "http://nsclick.baidu.com/v.gif?pid=201&pj=www&rsv_sid=&" + sQ + "path="+p+"&t="+new Date().getTime();return true;}if(/\bbdime=[12]/.test(d.cookie)){document.write('<script src=http://s1.bdstatic.com/r/www/cache/ime/js/openime-1.0.1.js><\/script>');}(function(){var u = G("u").getElementsByTagName("a"), nv = G("nv").getElementsByTagName("a"), lk = G("lk").getElementsByTagName("a"), un = "";var tj_nv = ["news","tieba","zhidao","mp3","img","video","map"];var tj_lk = ["baike","wenku","hao123","more"];un = bds.comm.user == "" ? "" : bds.comm.user;function _addTJ(obj){addEV(obj, "mousedown", function(e){var e = e || window.event;var target = e.target || e.srcElement;ns_c({'fm':'behs','tab':target.name||'tj_user','un':encodeURIComponent(un)});});}for(var i = 0; i < u.length; i++){_addTJ(u[i]);}for(var i = 0; i < nv.length; i++){nv[i].name = 'tj_' + tj_nv[i];}for(var i = 0; i < lk.length; i++){lk[i].name = 'tj_' + tj_lk[i];}})();(function() {var links = {'tj_news':   ['word', 'http://news.baidu.com/ns?tn=news&cl=2&rn=20&ct=1&ie=utf-8'],'tj_tieba':  ['kw', 'http://tieba.baidu.com/f?ie=utf-8'],'tj_zhidao': ['word', 'http://zhidao.baidu.com/search?pn=0&rn=10&lm=0'],'tj_mp3':    ['key', 'http://music.baidu.com/search?fr=ps&ie=utf-8'],'tj_img':    ['word', 'http://image.baidu.com/i?ct=201326592&cl=2&nc=1&lm=-1&st=-1&tn=baiduimage&istype=2&fm=&pv=&z=0&ie=utf-8'],'tj_video':  ['word', 'http://video.baidu.com/v?ct=301989888&s=25&ie=utf-8'],'tj_map':    ['wd', 'http://map.baidu.com/?newmap=1&ie=utf-8&s=s'],'tj_baike':  ['word', 'http://baike.baidu.com/search/word?pic=1&sug=1&enc=utf8'],'tj_wenku':  ['word', 'http://wenku.baidu.com/search?ie=utf-8']};var domArr = [G('nv'), G('lk')],kw = G('kw');for (var i = 0, l = domArr.length; i < l;  i++) {domArr[i].onmousedown = function(e) {e = e || window.event;var target = e.target || e.srcElement,name = target.getAttribute('name'),items = links[name],reg = new RegExp('^\\s+|\\s+\x24'),key = kw.value.replace(reg, '');if (items) {if (key.length > 0) {var wd = items[0], url = items[1],url = url + ( name === 'tj_map' ? encodeURIComponent('&' + wd + '=' + key)  : ( ( url.indexOf('?') > 0 ? '&' : '?' ) + wd + '=' + encodeURIComponent(key)  ) );target.href = url;} else {target.href = target.href.match(new RegExp('^http://.+\\.baidu\\.com'))[0];}}name && ns_c({'fm': 'behs','tab': name,'query': encodeURIComponent(key),'un': encodeURIComponent(bds.comm.user || '')    });};}})();addEV(w,"load",function(){k.focus()});w.onunload=function(){};</script><script type="text/javascript" src="http://s1.bdstatic.com/r/www/cache/global/js/tangram-1.3.4c1.0.js"></script><script type="text/javascript" src="http://s1.bdstatic.com/r/www/cache/user/js/u-1.3.4.js"></script><!--[if IE 6]><![endif]-->



</html>



<!--2eda2d7f34e418b3-->
getMethod's nameGET
getMethodBase:/
statusText:OK
getMethod to string:org.apache.commons.httpclient.methods.GetMethod@763f5d