Sina Weibo fan crawler: the WAP site only lets you crawl 20 pages


Weibo now treats its fan-distribution data as a commercial secret, so crawling it keeps getting harder: whether you crawl the web interface or the mobile (WAP) interface, you are restricted.

There are two approaches: manual crawling and crawling through the Weibo API.
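For reference, the API route would look roughly like the sketch below. It assumes you have registered an app and obtained an OAuth2 access_token; the endpoint and parameter names follow Weibo's open API 2.0 (friendships/followers) as I recall them, so verify against the current docs. Note the API also caps how far an ordinary app can page through a follower list, so it is not a silver bullet either.

```python
# A hedged sketch of the Weibo API route (not used by this article's script).
# Endpoint/parameter names are from Weibo's open API 2.0; check current docs.
import requests

def fetch_fans_via_api(uid, access_token, cursor=0, count=200):
    url = "https://api.weibo.com/2/friendships/followers.json"
    params = {
        "access_token": access_token,  # OAuth2 token from your registered app
        "uid": uid,                    # the account whose followers you want
        "count": count,                # followers per request
        "cursor": cursor,              # paging cursor; response has next_cursor
    }
    r = requests.get(url, params=params)
    data = r.json()
    # each entry in 'users' carries fields such as 'screen_name' and 'location'
    return data.get("users", []), data.get("next_cursor", 0)
```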

This article demonstrates the manual route, taking Li Yifeng's fan distribution as the example. Only 20 pages can be crawled; the source code is as follows:

```python
# encoding=utf-8
# NOTE: this script is Python 2 (print statements, str = bytes).
import random
import json
import base64
import re

import requests
from lxml import etree
import pymongo

"""
Enter your Weibo accounts and passwords. You can buy throwaway accounts on
Taobao, roughly seven for one yuan. Buy a few dozen: Weibo's limits are
strict, and requesting too frequently triggers a 302 redirect.
Alternatively, increase the interval between requests.
"""
myWeiBo = [
    # {'no': '314061410@qq.com', 'psw': '123456789'},
    {'no': '835163102@qq.com', 'psw': '987654321'},
    # {'no': 'shudieful3618@163.com', 'psw': 'a123456'},
    # {'no': '314061410@qq.com', 'psw': 'fw381105'},
]
host = "http://weibo.cn"
scrawl_ID = set()  # unused in this script
cookies = []


def getCookies(weibo):
    """Log each account in through the SSO endpoint and collect its cookies."""
    loginURL = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
    for elem in weibo:
        account = elem['no']
        password = elem['psw']
        # the SSO form expects the account name base64-encoded in 'su'
        username = base64.b64encode(account.encode('utf-8')).decode('utf-8')
        postData = {
            "entry": "sso",
            "gateway": "1",
            "from": "null",
            "savestate": "30",
            "useticket": "0",
            "pagerefer": "",
            "vsnf": "1",
            "su": username,
            "service": "sso",
            "sp": password,
            "sr": "1440*900",
            "encoding": "UTF-8",
            "cdult": "3",
            "domain": "sina.com.cn",
            "prelt": "0",
            "returntype": "TEXT",
        }
        session = requests.Session()
        r = session.post(loginURL, data=postData)
        info = json.loads(r.content.decode('gbk'))  # the endpoint answers in GBK
        if info["retcode"] == "0":
            print "Get Cookie Success! ( Account: %s )" % account
            cookies.append(session.cookies.get_dict())
        else:
            print "Failed! ( Reason: %s )" % info['reason']
    return cookies


def weiboLogin(ID):
    """Log in, then fetch the first page of ID's fan list."""
    getCookies(myWeiBo)
    print "Get Cookies Finish! ( Num: %d )" % len(cookies)
    cookie = random.choice(cookies)
    rich_url = "http://weibo.cn/%s/fans" % ID
    r = requests.post(rich_url, cookies=cookie)
    return r.content


def url_to_page(url):
    """Fetch a page with a randomly chosen cookie; return None on failure."""
    cookie = random.choice(cookies)
    r = requests.post(url, cookies=cookie)
    if r.status_code == requests.codes.ok:
        return r.content
    return None


def MongoInit():
    client = pymongo.MongoClient("localhost", 27017)
    db = client["Sina_by_fw"]
    Fans_db = db["Fans"]
    print "MongoInit() finish****"
    return Fans_db


fans_cities = []
page_count = 0
fans_count = 0
Fans_db = MongoInit()


def parse_for_fans(page, IDhost):
    """Extract each fan's nickname and region from one fan-list page, then
    recurse into the next page until the pager runs out (20 pages max on
    the WAP site)."""
    global Fans_db, fans_cities
    global page_count
    global fans_count
    page_count += 1
    print "page_count=", page_count
    IDs = set(re.findall(r'uid=(\d+)', page))
    for ID in IDs:
        if ID != str(IDhost):  # skip the crawled account itself (str vs int)
            fans_count += 1
            info_page = url_to_page("http://weibo.cn/%s/info" % ID)
            if info_page is None:
                continue  # request failed; move on to the next fan
            # u'昵称[:|:](.*?)<' -- "nickname"; the pattern must be encoded to
            # UTF-8 bytes because info_page is a raw byte string
            expression_nick = u'\u6635\u79f0[:|\uff1a](.*?)<'
            # u'地区[:|:](.*?)<' -- "region/city"
            expression_city = u'\u5730\u533a[:|\uff1a](.*?)<'
            nicks = re.findall(expression_nick.encode('UTF-8'), info_page)
            citys = re.findall(expression_city.encode('UTF-8'), info_page)
            if not nicks or not citys:
                continue  # profile hides its nickname or region
            nick, city = nicks[0], citys[0]
            print nick, city, fans_count
            fans_cities.append(city)
            '''
            # alternative: flush to MongoDB every 50 fans instead of all at once
            if len(fans_cities) == 50:
                fans_cities_dict = dict()
                for i in range(len(fans_cities)):
                    fans_cities_dict[str(i + 1)] = fans_cities[i]
                Fans_db.insert(fans_cities_dict)
                del fans_cities[:]
            '''
    e_page = etree.HTML(page)
    # the pager link whose text is u'下页' ("next page")
    url_next = e_page.xpath(
        u'//div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href')
    if url_next:
        next_page = url_to_page(host + url_next[0])
        if next_page:
            parse_for_fans(next_page, IDhost)
    else:
        # last page reached: dump everything collected into MongoDB
        fans_cities_dict = dict()
        for i in range(len(fans_cities)):
            fans_cities_dict[str(i + 1)] = fans_cities[i]
        Fans_db.insert(fans_cities_dict)
        del fans_cities[:]


# Weibo shows: "System notice: to prevent harassment, Weibo's smart anti-spam
# system has filtered out some advertising users."
page = weiboLogin(ID=1291477752)           # the uid used in this article (Li Yifeng)
parse_for_fans(page, IDhost=1291477752)
```
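The docstring at the top of the script warns that over-frequent requests trigger a 302 redirect. One way to act on its "increase the interval" advice is a throttled wrapper around url_to_page; the name fetch_with_delay and the delay bounds below are my own illustration, not part of the original script.

```python
# A sketch of the "increase the interval" advice: wrap url_to_page with a
# random sleep and a few retries. Relies on random and url_to_page from the
# script above; delay bounds are illustrative.
import time

def fetch_with_delay(url, min_delay=2.0, max_delay=5.0, retries=3):
    for attempt in range(retries):
        time.sleep(random.uniform(min_delay, max_delay))  # throttle requests
        page = url_to_page(url)
        if page is not None:
            return page
        print "retry %d/%d for %s" % (attempt + 1, retries, url)
    return None
```

Calls such as url_to_page(host + url_next[0]) inside parse_for_fans could then go through this wrapper instead, at the cost of a slower crawl.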


Notes: 1. The site's response data is raw UTF-8 bytes, so it must be decoded as UTF-8 (or, as in the regexes above, the patterns must be encoded to UTF-8 bytes before matching).
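Concretely: in Python 2, a unicode pattern such as u'昵称[:|:](.*?)<' never matches a UTF-8 byte string, because its CJK codepoints exceed any single byte value, so the pattern is encoded first. A minimal illustration (the profile snippet is made up):

```python
# -*- coding: utf-8 -*-
# Python 2; info_page stands in for r.content, which requests returns as bytes.
import re

info_page = u'昵称:张三<br/>地区:北京<br/>'.encode('utf-8')  # fabricated sample

pattern = u'\u6635\u79f0[:|\uff1a](.*?)<'          # u'昵称[:|:](.*?)<'
print re.findall(pattern, info_page)               # [] -- unicode vs bytes, no match
print re.findall(pattern.encode('utf-8'), info_page)
# ['\xe5\xbc\xa0\xe4\xb8\x89'] -- the UTF-8 bytes of '张三'
```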

2. Documents written to MongoDB must be Python dictionaries, which is why the script converts the fans_cities list into a numbered dict before inserting.
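A minimal standalone sketch of that conversion (the city values are sample data; the script's py2-era insert() call is deprecated in modern pymongo in favor of insert_one()):

```python
# -*- coding: utf-8 -*-
import pymongo

client = pymongo.MongoClient("localhost", 27017)
fans = client["Sina_by_fw"]["Fans"]

cities = [u'北京', u'上海', u'广州']  # sample data, not real crawl output
doc = dict((str(i + 1), city) for i, city in enumerate(cities))
fans.insert_one(doc)  # documents must be dicts; a bare list is rejected
```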
