scrapy框架爬取51job网

来源:互联网 发布:线条随鼠标特效源码 编辑:程序博客网 时间:2024/06/10 05:00
# -*- coding: utf-8 -*-
"""CrawlSpider that scrapes PHP / Python job listings from search.51job.com."""
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from manhua.items import ManhuaItem


class DemoSpider(CrawlSpider):
    """Crawl 51job search-result pages and yield one ManhuaItem per job row."""

    name = "demo"

    # Two seed searches: one for PHP engineers, one for Python engineers
    # (the second URL carries the percent-encoded query "Python开发工程师").
    start_urls = [
        "http://search.51job.com/list/000000,000000,0000,00,9,99,php,2,1.html?lang=c&degreefrom=99&stype=&workyear=99&cotype=99&jobterm=99&companysize=99&radius=-1&address=&lonlat=&postchannel=&list_type=&ord_field=&curr_page=&dibiaoid=0&landmark=&welfare=",
        "http://search.51job.com/list/000000,000000,0000,00,9,99,Python%25E5%25BC%2580%25E5%258F%2591%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=",
    ]

    # Follow pagination links found inside the pager <div class="p_in"> only.
    # Fixes from the original: `rules` is a tuple (the conventional, ordered
    # container) rather than a set, and the regex is a raw string with dots
    # escaped instead of the invalid-escape form "http:\/\/search.51job.com\/".
    rules = (
        Rule(
            LinkExtractor(
                allow=r"http://search\.51job\.com/list/",
                restrict_xpaths="//div[@class='p_in']",
            ),
            callback="parse_item",
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Extract one ManhuaItem per job row (<div class="el">) on the page.

        :param response: scrapy Response for a search-result page.
        :yields: ManhuaItem with duty / time / name / location / sallary set.
        """
        for row in response.xpath("//div[@class='el']"):
            # BUG FIX: build a fresh item per row. The original created a
            # single ManhuaItem before the loop and mutated it each iteration,
            # so every yielded reference pointed at the same shared object.
            item = ManhuaItem()
            try:
                # [0] raises IndexError on rows without a title link
                # (e.g. the header row) — that is the only failure we expect,
                # so catch it narrowly instead of `except Exception: pass`.
                item['duty'] = row.xpath("./p/span/a/text()")[0].extract().strip()
            except IndexError:
                continue
            item['time'] = row.xpath("./span[4]/text()").extract()
            item['name'] = row.xpath("./span[1]/a/text()").extract()
            item['location'] = row.xpath("./span[2]/text()").extract()
            item['sallary'] = row.xpath("./span[3]/text()").extract()
            yield item

    # Backward-compatible alias: the original method was misspelled
    # "paser_item"; keep the old name resolvable for any external caller.
    paser_item = parse_item


# ---------------------------------------------------------------- items.py --
# (In the real project this class lives in manhua/items.py, which the spider
# imports above; it is reproduced here because the source article inlined it.)
class ManhuaItem(scrapy.Item):
    """Container for one 51job listing scraped by DemoSpider."""
    # NOTE: 'sallary' is a typo in the original, kept deliberately — the
    # spider's field assignments and any exported CSV headers depend on it.
    name = scrapy.Field()      # company name
    duty = scrapy.Field()      # job title
    location = scrapy.Field()  # company address
    sallary = scrapy.Field()   # salary
    time = scrapy.Field()      # posting date


# Run from the console with:  scrapy crawl demo -o file.csv
0 0