python爬取廖雪峰javascript教程

来源：互联网　发布：suse linux设置网络　编辑：程序博客网　时间：2024/06/03 01:45

这篇文章和我上一篇《爬取廖雪峰 Python3 教程》的做法基本一样，只是用到了一些不一样的东西。在接下来的学习中，我还是会定期更新一些内容供大家阅读。

# coding: utf-8
"""Download every chapter of Liao Xuefeng's JavaScript tutorial to local files.

The script reads the tutorial's index page, collects the chapter links found
in the left sidebar, and then saves the heading plus body text of each
chapter into its own ``.js`` file inside ``OUTPUT_DIR``.
"""
import codecs
import os
import sys

import lxml  # not used directly; BeautifulSoup is asked for the 'lxml' parser
import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://www.liaoxuefeng.com'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
OUTPUT_DIR = 'g:\\file\\'
REQUEST_TIMEOUT = 30  # seconds; without it a stalled connection blocks forever

# Characters that cannot appear in a Windows file name.
_INVALID_FILENAME_CHARS = u'\\/:*?"<>|'


def _fetch_soup(url):
    """Fetch *url* and return its HTML parsed with the lxml parser.

    Raises ``requests.RequestException`` on connection errors, timeouts and
    non-2xx HTTP responses.
    """
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')


def _safe_filename(title):
    """Turn a chapter title into a string usable as a file name."""
    # The title may clash with the path syntax; keep the historical name
    # that was used for this particular chapter.
    if title == 'map/reduce':
        return 'map and reduce'
    return u''.join(
        u'_' if ch in _INVALID_FILENAME_CHARS else ch for ch in title)


def get_url(url):
    """Return the absolute URLs of all chapter links on the index page.

    Links are taken from the ``div`` with class ``x-sidebar-left-content``.
    An empty list is returned when that ``div`` is missing; anchors without
    an ``href`` attribute are ignored.
    """
    soup = _fetch_soup(url)
    sidebar = soup.find('div', class_='x-sidebar-left-content')
    if sidebar is None:
        return []
    return [BASE_URL + a['href'] for a in sidebar.find_all('a', href=True)]


def get_text(all_url, out_dir=OUTPUT_DIR):
    """Download every page in *all_url* and write its text into *out_dir*.

    For each page the first ``h4`` heading is used as the title (and as the
    base of the file name) and the ``div`` with class ``x-wiki-content`` as
    the body.  Pages that cannot be fetched, or that lack either element,
    are reported and skipped so one bad chapter does not abort the run.
    """
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    for url in all_url:
        try:
            soup = _fetch_soup(url)
        except requests.RequestException as exc:
            print(u'skipping %s: %s' % (url, exc))
            continue

        heading = soup.find('h4')
        div = soup.find('div', class_='x-wiki-content')
        if heading is None or div is None:
            print(u'skipping %s: expected page structure not found' % url)
            continue

        title = heading.text
        filename = os.path.normpath(
            os.path.join(out_dir, _safe_filename(title) + '.js'))
        print(u'正在下载' + title)
        # Write the heading followed by the chapter text into the .js file.
        with codecs.open(filename, 'w', encoding='utf-8') as f:
            f.write(title + div.text)


if __name__ == "__main__":
    url = 'http://www.liaoxuefeng.com/wiki/001434446689867b27157e896e74d51a89c25cc8b43bdb3000'
    all_url = get_url(url)
    get_text(all_url)

0 0