数据爬取
来源:互联网 发布:坐动车下载什么软件 编辑:程序博客网 时间:2024/06/10 00:26
问题导读:
1.了解爬虫编写的基本思路
2.扩充淘淘商城数据量
技术:
抓取京东商品数据
数据保存到数据库中和索引库中
使用线程池进行多线程并发
使用jsoup分析页面
使用httpclient连接池实现http请求管理
Spring+Mybatis+HttpClient+Lucene+Jsoup
编写爬虫分析:
1.指定入口页面
2.根据规则抓取页面数据
3.根据html源码,获取到所需要的内容
4.存储(mysql、索引库)
抓取图片:
$("#J_goodsList li").eq(0).find(".p-img img").eq(1).attr("src");"//img13.360buyimg.com/n7/jfs/t2593/193/3399807000/138331/3d4afe1b/578c40f1N8418b28a.jpg"$("#J_goodsList li").eq(0).find(".p-price strong").attr("data-price");"5499.00"$("#J_goodsList li").eq(0).find(".p-name a").attr("title");"联想(Lenovo)拯救者 ISK15.6英寸游戏笔记本电脑(i5-6300HQ 8G 1T HDD GTX960M 4G独显 FHD IPS屏 )黑"$("#J_goodsList li").eq(0).find(".p-commit strong").text()"已有4.2万+人评价"
JD查询商品价格的接口:
https://p.3.cn/prices/mgets?skuIds=J_2600210
多个商品返回,中间使用“,”分隔:
https://chat1.jd.com/api/checkChat?pidList=2605210
主代码实现:
package cn.itcast.crawler;

import java.util.Map;

import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;

import cn.itcast.crawler.thread.ThreadPool;

/**
 * Crawler bootstrap: loads the Spring context, collects every {@link Crawler}
 * bean defined in it, and hands each one to the shared thread pool to run.
 */
public class Main {

    /**
     * Shared Spring context. Assigned before any task is submitted because
     * ThreadPool reads it during its own static initialization.
     */
    public static ApplicationContext applicationContext;

    public static void main(String[] args) throws Exception {
        applicationContext = new ClassPathXmlApplicationContext("spring/applicationContext*.xml");

        // Every bean implementing Crawler is an independent crawl job.
        Map<String, Crawler> crawlerBeans = applicationContext.getBeansOfType(Crawler.class);
        for (Map.Entry<String, Crawler> entry : crawlerBeans.entrySet()) {
            ThreadPool.runInThread(entry.getValue());
        }
    }
}
ThreadPool
package cn.itcast.crawler.thread;

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.ThreadPoolExecutor.CallerRunsPolicy;
import java.util.concurrent.TimeUnit;

import cn.itcast.crawler.Main;
import cn.itcast.crawler.service.PropertieService;

/**
 * Shared, statically configured thread pool for crawler jobs.
 *
 * NOTE(review): this class reads Main.applicationContext during static
 * initialization, so it must not be class-loaded before Main.main() has
 * created the Spring context — TODO confirm nothing else loads it earlier.
 */
public class ThreadPool {

    /** Minimum number of threads kept alive in the pool. */
    private static final int CORE_POOL_SIZE = 2;

    /** Maximum number of threads, taken from external configuration. */
    private static final int MAXIMUM_POOL_SIZE =
            Integer.parseInt(Main.applicationContext.getBean(PropertieService.class).MAX_POOL_SIZE);

    /** How long an idle thread above the core size is kept alive. */
    private static final long KEEP_ALIVE_TIME = 4;

    /** Unit of {@link #KEEP_ALIVE_TIME}. */
    private static final TimeUnit UNIT = TimeUnit.SECONDS;

    /** Bounded work queue; holds at most 3 waiting tasks. */
    private static final BlockingQueue<Runnable> WORK_QUEUE = new ArrayBlockingQueue<Runnable>(3);

    // FIX: the original AbortPolicy threw RejectedExecutionException as soon as
    // all threads were busy and the 3-slot queue was full, which aborted the
    // submitting crawler and lost work. CallerRunsPolicy instead executes the
    // overflow task on the caller's own thread, throttling submission safely.
    private static final CallerRunsPolicy HANDLER = new ThreadPoolExecutor.CallerRunsPolicy();

    private static ThreadPoolExecutor threadPool = new ThreadPoolExecutor(CORE_POOL_SIZE,
            MAXIMUM_POOL_SIZE, KEEP_ALIVE_TIME, UNIT, WORK_QUEUE, HANDLER);

    /**
     * Submits a task to the shared pool for execution.
     *
     * @param runnable the task to run
     */
    public static void runInThread(Runnable runnable) {
        threadPool.execute(runnable);
    }
}
spring中的配置文件:
<!-- 平板电视 (flat-panel TVs) -->
<bean class="cn.itcast.crawler.JD3CCrawler">
    <!-- Listing url; {page} is the pagination placeholder.
         FIX: '&' must be escaped as '&amp;' inside XML attribute values,
         otherwise the Spring XML parser rejects the file as malformed. -->
    <constructor-arg index="0" value="http://list.jd.com/list.html?cat=737,794,798&amp;page={page}"/>
    <!-- Category id in the local system -->
    <constructor-arg index="1" value="76"/>
    <!-- Collaborating services -->
    <property name="httpService" ref="httpService"/>
    <property name="itemMapper" ref="itemMapper"/>
    <property name="indexWriter" ref="indexWriter"/>
</bean>

<!-- 手机 (mobile phones) -->
<bean class="cn.itcast.crawler.JD3CCrawler">
    <!-- Listing url; {page} is the pagination placeholder ('&' escaped as above). -->
    <constructor-arg index="0" value="http://list.jd.com/list.html?cat=9987,653,655&amp;page={page}"/>
    <!-- Category id in the local system -->
    <constructor-arg index="1" value="560"/>
    <!-- Collaborating services -->
    <property name="httpService" ref="httpService"/>
    <property name="itemMapper" ref="itemMapper"/>
    <property name="indexWriter" ref="indexWriter"/>
</bean>
JD3CCrawler
package cn.itcast.crawler;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import cn.itcast.crawler.pojo.Item;
import cn.itcast.crawler.pojo.ItemDesc;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;

/**
 * Crawler for JD 3C listing pages: parses the product list, downloads each
 * item's description, then batch-queries prices and sell points (ads).
 */
public class JD3CCrawler extends BaseCrawler {

    private static final Logger LOGGER = LoggerFactory.getLogger(JD3CCrawler.class);

    private String baseUrl;
    private Long cid;

    /** JSONP endpoint serving the item description html. */
    private static final String DETAIL_URL = "http://d.3.cn/desc/{id}";
    /** Batch price endpoint; ids are comma-separated, prefixed with "J_". */
    private static final String PRICE_URL = "http://p.3.cn/prices/mgets?skuIds=";
    /** Batch ad (sell point) endpoint; ids are prefixed with "AD_". */
    private static final String AD_URL = "http://ad.3.cn/ads/mgets?skuids=";
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * @param baseUrl listing url template, e.g.
     *                http://list.jd.com/list.html?cat=737,794,798&page={page}
     * @param cid     category id in the local system
     */
    public JD3CCrawler(String baseUrl, Long cid) {
        this.baseUrl = baseUrl;
        this.cid = cid;
    }

    @Override
    protected Collection<Item> doParser(String html) {
        Document document = Jsoup.parse(html); // listing page
        Elements lis = document.select("#plist li.gl-item"); // one <li> per product
        Map<String, Item> items = new HashMap<String, Item>();
        for (Element li : lis) {
            Item item = new Item();
            String id = li.select("div.j-sku-item").attr("data-sku");
            String title = li.select(".p-name").text();
            String image = li.select(".p-img img").attr("data-lazy-img");
            String desc = getContent(id);
            // description images are lazy-loaded; promote the real url into src
            desc = StringUtils.replace(desc, "data-lazyload", "src");
            item.setId(Long.valueOf(id));
            item.setImage(image);
            item.setTitle(title);
            item.setNum(99999L);
            item.setCid(this.cid);
            item.setStatus(1);
            ItemDesc itemDesc = new ItemDesc();
            itemDesc.setItemId(item.getId());
            itemDesc.setItemDesc(desc);
            item.setItemDesc(itemDesc);
            items.put(id, item);
        }

        fillPrices(items);
        fillSellPoints(items);
        return items.values();
    }

    /** Batch-queries the price service and stores each item's price. */
    private void fillPrices(Map<String, Item> items) {
        List<String> ids = new ArrayList<String>();
        for (String id : items.keySet()) {
            ids.add("J_" + id);
        }
        try {
            String priceJson = doGet(PRICE_URL + StringUtils.join(ids, ','));
            ArrayNode arrayNode = (ArrayNode) MAPPER.readTree(priceJson);
            for (JsonNode jsonNode : arrayNode) {
                String id = StringUtils.substringAfter(jsonNode.get("id").asText(), "_");
                // FIX: "p" comes back as a decimal string such as "5499.00";
                // asLong() on a non-integral textual node returns 0, so every
                // price was stored as 0. Parse as double first, then scale.
                Long price = (long) (jsonNode.get("p").asDouble() * 1000);
                Item item = items.get(id);
                if (item != null) { // FIX: guard against ids we did not request
                    item.setPrice(price);
                }
            }
        } catch (Exception e) {
            // FIX: log with context instead of printStackTrace
            LOGGER.error("查询商品价格失败!", e);
        }
    }

    /** Batch-queries the ad service and stores each item's sell point. */
    private void fillSellPoints(Map<String, Item> items) {
        List<String> ids = new ArrayList<String>();
        for (String id : items.keySet()) {
            ids.add("AD_" + id);
        }
        try {
            String adJson = doGet(AD_URL + StringUtils.join(ids, ','));
            ArrayNode arrayNode = (ArrayNode) MAPPER.readTree(adJson);
            for (JsonNode jsonNode : arrayNode) {
                String id = StringUtils.substringAfter(jsonNode.get("id").asText(), "_");
                String ad = jsonNode.get("ad").asText();
                Item item = items.get(id);
                if (item != null) { // FIX: guard against ids we did not request
                    item.setSellPoint(ad);
                }
            }
        } catch (Exception e) {
            LOGGER.error("查询商品卖点(广告)失败!", e);
        }
    }

    /**
     * Downloads and unwraps one item's description.
     *
     * @param id the JD sku id
     * @return the description html, or null when missing or unparseable
     */
    private String getContent(String id) {
        String url = StringUtils.replace(DETAIL_URL, "{id}", id);
        String html = null;
        try {
            html = super.doGet(url, "GBK"); // the description service answers in GBK
            if (StringUtils.contains(html, "404 Not Found")) {
                LOGGER.info("查询不到商品描述数据....... url = " + url);
                return null;
            }
        } catch (Exception e) {
            LOGGER.error("下载商品描述失败! url = " + url, e);
            return null;
        }
        // strip the JSONP wrapper: showdesc({ ... })
        String jsonData = StringUtils.substringAfter(html, "showdesc(");
        jsonData = StringUtils.substringBeforeLast(jsonData, ")");
        try {
            JsonNode jsonNode = MAPPER.readTree(jsonData);
            return jsonNode.get("content").asText();
        } catch (Exception e) {
            LOGGER.error("解析商品描述失败! url = " + url, e);
        }
        return null;
    }

    @Override
    protected String getPageUrl(Integer page) {
        return StringUtils.replace(this.baseUrl, "{page}", page + "");
    }

    @Override
    protected Integer getTotalPage() {
        String html = null;
        try {
            html = super.doGet(getPageUrl(1));
        } catch (Exception e) {
            LOGGER.error("getTotalPage error !", e);
            return 0;
        }
        Document document = Jsoup.parse(html);
        String pageHtml = document.select(".p-skip").html();
        // split on runs of non-digits; the leading non-digit text yields an empty
        // first element, so no[1] is the first number in the pager html —
        // assumes that number is the total page count, TODO confirm against JD markup
        String[] no = pageHtml.split("\\D+");
        return Integer.valueOf(no[1]);
    }
}
HttpService
package cn.itcast.crawler.service;

import java.io.File;
import java.net.URI;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

/**
 * Thin wrapper around the pooled HttpClient: GET requests with configurable
 * response charset, and binary file downloads.
 */
@Service
public class HttpService {

    /** Default charset used to decode response bodies. */
    private static final String CHARSET = "UTF-8";

    private static final Logger LOGGER = LoggerFactory.getLogger(HttpService.class);

    @Autowired
    private CloseableHttpClient httpClient;

    @Autowired
    private RequestConfig requestConfig;

    /** Executes a GET request, decoding the body with the default charset. */
    public String doGet(String url) throws Exception {
        // FIX: reuse the CHARSET constant instead of a duplicated "UTF-8" literal
        return doGet(url, null, CHARSET);
    }

    /**
     * Executes a GET request.
     *
     * @param url    target url
     * @param encode charset for the response body; null falls back to UTF-8
     */
    public String doGet(String url, String encode) throws Exception {
        return doGet(url, null, encode);
    }

    /**
     * Executes a GET request.
     *
     * @param url    target url (query-less when params is supplied)
     * @param params optional query parameters to append; may be null
     * @param encode charset for the response body; null falls back to UTF-8
     * @return the decoded response body
     */
    public String doGet(String url, Map<String, String> params, String encode) throws Exception {
        // FIX: parameter renamed from the misspelled "parapms"
        URI uri;
        if (null == params) {
            uri = URI.create(url);
        } else {
            URIBuilder builder = new URIBuilder(url);
            for (Map.Entry<String, String> entry : params.entrySet()) {
                builder.addParameter(entry.getKey(), entry.getValue());
            }
            uri = builder.build();
        }
        LOGGER.info("执行Http Get请求,URL:" + uri);

        HttpGet httpGet = new HttpGet(uri);
        httpGet.setConfig(requestConfig);
        // try-with-resources guarantees the connection returns to the pool
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (encode == null) {
                encode = CHARSET;
            }
            return EntityUtils.toString(response.getEntity(), encode);
        }
    }

    /**
     * Downloads a file.
     *
     * @param url  file url
     * @param dest destination file
     * @throws Exception on network or I/O failure
     */
    public void downloadFile(String url, File dest) throws Exception {
        LOGGER.info("下载文件,URL:" + url);
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(requestConfig);
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            FileUtils.writeByteArrayToFile(dest, IOUtils.toByteArray(response.getEntity().getContent()));
        }
    }
}
ItemMapper
package cn.itcast.crawler.mapper;

import java.util.Collection;

import org.apache.ibatis.annotations.Param;

import cn.itcast.crawler.pojo.Item;

/**
 * MyBatis mapper for persisting crawled items.
 */
public interface ItemMapper {

    /**
     * Batch-inserts the given items.
     *
     * @param items the items to insert
     * @return presumably the number of inserted rows — depends on the SQL
     *         mapping, TODO confirm
     */
    public Long saveItems(@Param("items") Collection<Item> items);
}
BaseCrawler
package cn.itcast.crawler;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import cn.itcast.crawler.mapper.ItemMapper;
import cn.itcast.crawler.pojo.Item;
import cn.itcast.crawler.service.HttpService;
import cn.itcast.crawler.thread.ThreadPool;

/**
 * Template base class for crawlers: pages through a listing, delegates
 * page-parsing to subclasses, rewrites image urls to the local image host,
 * schedules image downloads, and persists items to the database and the
 * Lucene index.
 */
public abstract class BaseCrawler implements Crawler {

    private static final Logger LOGGER = LoggerFactory.getLogger(BaseCrawler.class);

    private HttpService httpService;
    private ItemMapper itemMapper;
    private IndexWriter indexWriter;

    /**
     * Runs the full crawl: for every page, fetch and parse items, rewrite
     * their image urls, schedule the downloads, then save to MySQL and Lucene.
     */
    public void start() {
        Integer totalPage = getTotalPage();
        for (int i = 1; i <= totalPage; i++) {
            LOGGER.info("当前第{}页,总共{}页。", i, totalPage);
            Collection<Item> items = doStart(i);
            if (items == null) {
                LOGGER.info("抓取到 0 条数据");
                continue;
            }
            LOGGER.info("抓取到{}条数据", items.size());

            // Map the original JD image url -> locally generated file name, so
            // the download job knows what to fetch and where to store it.
            Map<String, String> urlMapping = new HashMap<String, String>();
            for (Item item : items) {
                // Random file name keeping the original extension.
                // NOTE(review): if getImage() is empty the name ends with a bare
                // '.' — presumably the parser always supplies a url; TODO confirm.
                String newName = StringUtils.replace(UUID.randomUUID().toString(), "-", "") + "."
                        + StringUtils.substringAfterLast(item.getImage(), ".");
                urlMapping.put(item.getImage(), newName);
                item.setImage("http://image.taotao.com/jd/" + newName);
            }

            // Download the images asynchronously on the shared pool.
            ThreadPool.runInThread(new ImageDownloadCrawler(urlMapping));

            saveDataToDB(items);
            LOGGER.info("将数据保存到数据库完成 ({})!", items.size());

            saveDataToLucene(items);
            LOGGER.info("将数据保存到索引库完成 ({})!", items.size());
        }
    }

    /** Persists the batch through the MyBatis mapper. */
    private void saveDataToDB(Collection<Item> items) {
        itemMapper.saveItems(items);
    }

    /** Converts the items to Lucene documents and commits them to the index. */
    private void saveDataToLucene(Collection<Item> items) {
        List<Document> docs = new ArrayList<Document>(items.size());
        for (Item item : items) {
            try {
                docs.add(item.toDocument());
            } catch (Exception e) {
                // FIX: log with the failing item id instead of printStackTrace
                LOGGER.error("商品转Document失败! id = " + item.getId(), e);
            }
        }
        try {
            this.indexWriter.addDocuments(docs);
            this.indexWriter.commit();
        } catch (IOException e) {
            LOGGER.error("写入索引库失败!", e);
        }
    }

    public String doGet(String url) throws Exception {
        return this.httpService.doGet(url);
    }

    public String doGet(String url, String encode) throws Exception {
        return this.httpService.doGet(url, encode);
    }

    /**
     * Fetches and parses one listing page.
     *
     * @param page 1-based page number
     * @return parsed items, or null when the page could not be downloaded
     */
    protected Collection<Item> doStart(Integer page) {
        String url = getPageUrl(page);
        LOGGER.info(" URL is " + url);
        String html = null;
        try {
            html = this.httpService.doGet(url);
        } catch (Exception e) {
            // FIX: log with the failing url instead of printStackTrace
            LOGGER.error("下载页面失败! url = " + url, e);
        }
        if (html == null) {
            return null;
        }
        return doParser(html);
    }

    /**
     * Parses the listing html into items.
     *
     * @param html raw listing page html
     * @return the parsed items
     */
    protected abstract Collection<Item> doParser(String html);

    /**
     * Builds the url of the given page.
     *
     * @param page 1-based page number
     * @return the page url
     */
    protected abstract String getPageUrl(Integer page);

    /**
     * Determines the total number of pages to crawl.
     *
     * @return the page count
     */
    protected abstract Integer getTotalPage();

    @Override
    public void run() {
        start();
    }

    public void setHttpService(HttpService httpService) {
        this.httpService = httpService;
    }

    public void setItemMapper(ItemMapper itemMapper) {
        this.itemMapper = itemMapper;
    }

    public void setIndexWriter(IndexWriter indexWriter) {
        this.indexWriter = indexWriter;
    }
}
0 0
- 数据爬
- 爬数据
- 爬取广州天气数据和数据可视化
- 爬取高考数据
- 证券数据爬取
- R语言 爬数据
- php爬取数据
- Python爬虫爬数据
- wget爬数据命令
- 动态数据爬取
- python爬取数据
- Python爬数据
- mitmproxy爬APP数据
- htmlunit爬取数据
- admob数据爬取
- facebook数据爬取
- 数据
- 数据
- 一辆停在广场上的献血车引发的惨案
- 书单列表
- 【14.06%】【hdu 5904】LCIS
- android属性动画总结
- Java Web 工作原理
- 数据爬
- 异常
- MySQL 如何利用一条语句实现类似于if-else条件语句的判断
- JAVA基础知识点梳理七:封装
- oracle中相关参数的修改
- Java-克隆
- 上海迪士尼乐园史上最强游玩攻略
- 很喜欢CSDN这个社区,所以来安家了
- PCB布线技巧(经验)