python

来源:互联网 发布:淘宝 废铁战士 编辑:程序博客网 时间:2024/06/11 15:06
import datetime
import re
from urllib.parse import urljoin

import pandas
import requests
from pyquery import PyQuery as pq

# Seconds to wait on a request before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Matches a YYYY-MM-DD date embedded in the listing-date field.
_LISTING_DATE_RE = re.compile(r'(\d{4}-\d{2}-\d{2})')


def save_as_csv(houses_info_list):
    """Write the scraped listings to a CSV file named after today's date.

    Args:
        houses_info_list: list of dicts as returned by ``get_house``; each
            dict becomes one row and the union of keys becomes the columns.

    The file is written to the current working directory as
    ``<YYYY-MM-DD>上海二手房信息.csv``.
    """
    houses_df = pandas.DataFrame(houses_info_list)
    today = datetime.date.today().strftime('%Y-%m-%d')
    houses_df.to_csv('{}上海二手房信息.csv'.format(today))


def get_house_info_list(url):
    """Scrape every listing linked from the index page at ``url``.

    Args:
        url: address of the listing index page.

    Returns:
        A list with one info dict (see ``get_house``) per listing whose
        detail page could be fetched.
    """
    html = requests.get(url, timeout=REQUEST_TIMEOUT).text
    house_items = pq(html).find('.houseList > .list > .info').items()
    house_list = []
    for house in house_items:
        href = house.find('.title > a').attr('href')
        if not href:
            # No link in this item; nothing to follow.
            continue
        # urljoin handles root-relative ('/x.htm'), relative and absolute
        # hrefs; plain concatenation produced 'http://host//x.htm'.
        next_url = urljoin(url, href)
        try:
            house_info = get_house(next_url)
        except requests.RequestException as exc:
            # One unreachable detail page should not discard the rest.
            print('skipping {}: {}'.format(next_url, exc))
            continue
        house_list.append(house_info)
    return house_list


def get_house(url):
    """Fetch one listing detail page and extract its fields.

    Args:
        url: address of the listing detail page.

    Returns:
        A dict mapping field labels (as they appear on the page, plus the
        fixed keys '标题', '价格', '联系人', '联系方式') to their text values.

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    info = {}
    html = requests.get(url, timeout=REQUEST_TIMEOUT).text
    doc = pq(html)
    info['标题'] = doc.find('#lpname').text()
    info['价格'] = doc.find('div.trl-item.sty1').text()

    # These items render as "<value> <label>"; split on the last run of
    # whitespace so a value that itself contains spaces does not crash.
    for item in doc.find('.trl-item1').items():
        parts = item.text().strip().rsplit(None, 1)
        if len(parts) != 2:
            continue
        value, key = parts
        info[key] = value

    for item in doc.find('.trl-item2').items():
        # Collapse all internal whitespace in both label and content.
        key = ''.join(item.find('.lab').text().split())
        value = ''.join(item.find('.rcont').text().split())
        info[key] = value.replace('地图', '')

    info['联系人'] = doc.find('#agentname').text()
    info['联系方式'] = doc.find('#mobilecode').text()

    for item in doc.find('.qu_bianqu1 > .text-item').items():
        key = item.find('.lab').text()
        value = item.find('.rcont').text()
        if key == '挂牌时间':
            # Keep only the date part; fall back to the raw text when the
            # field does not contain a recognisable date.
            match = _LISTING_DATE_RE.search(value)
            if match:
                value = match.group(1)
        info[key] = value

    print(info)
    return info


if __name__ == '__main__':
    houses_info_list = get_house_info_list('http://esf.sh.fang.com/')
    save_as_csv(houses_info_list)

原创粉丝点击