014-案例开发.Storm计算网站PV

来源:互联网 发布:软件皮肤下载 编辑:程序博客网 时间:2024/06/09 14:27
采用Storm统计网站的PV,需要从两个方面考虑(1) 性能问题  (2) 线程安全考虑


一、需求分析 
(1)网站最常用访问量指标
PV(page views): count (session_id)

(2)多线程下,注意线程安全问题
PV统计

方案分析
如下是否可行?
1、定义 static long pv,用 synchronized 控制累计操作
synchronized 和 Lock 在单 JVM 下有效,但在多 JVM(分布式)环境下无效
常用的代码一般为:设置static变量和通过synchronized关键字处理,代码为:
   static private long pv = 0; // shared counter: safe-looking under synchronized in a single JVM, but each worker JVM in a distributed topology keeps its own copy, so the cluster-wide total is wrong
    /**
     * Process a single tuple of input.
     * Counts one page view per non-blank "line" field and prints the running
     * total tagged with the executing thread's name and id.
     * @param input The input tuple to be processed.
     */
    @Override
    public void execute(Tuple input ) {
         String line = input .getStringByField("line" );
         synchronized (this ) { // serializes the increment across executor threads, but only within this one JVM
              if (StringUtils.isNotBlank( line)){
                   pv ++;
              }
         }
         Thread currentThread = Thread.currentThread();
         System.out .println(currentThread .getName() + "[" +currentThread .getId()+ "]" + "->" + pv );
    }

可行的两个方案:
1、shuffleGrouping下,pv * Executor并发数
2、bolt1进行多并发局部汇总,bolt2单线程进行全局汇总

线程安全:多线程处理的结果和单线程一致

二、统计PV的流程图以及Storm代码



采用Storm进行数据汇总的大致步骤:先通过一级bolt在高并发多线程情况下统计出部分数据,然后通过单线程的二级bolt进行整体汇总,输出结果
 (1) PVTopology主程序
package com.yun.storm.pv;import java.util.HashMap;import java.util.Map;import backtype.storm.Config;import backtype.storm.LocalCluster;import backtype.storm.topology.TopologyBuilder;/** * 实时统计PV拓扑 * @author shenfl * @version V1.0 */public class PVTopology {        public final static String SPOUT_ID = PVSpout.class.getSimpleName();    public final static String PVBOLT_ID = PVBolt.class.getSimpleName();    public final static String PVTOPOLOGY_ID = PVTopology.class.getSimpleName();    public final static String PVSUMBOLT_ID = PVSumBolt.class.getSimpleName();        public static void main(String[] args) {         TopologyBuilder builder = new TopologyBuilder();                   /*//表示kafka使用的zookeeper的地址         String brokerZkStr = "192.168.35:2181,192.168.36:2181,192.168.37:2181";         ZkHosts zkHosts = new ZkHosts(brokerZkStr);         //表示的是kafak中存储数据的主题名称         String topic = "pvtopic";         //指定zookeeper中的一个根目录,里面存储kafkaspout读取数据的位置等信息         String zkRoot = "/kafkaspout";         String id = UUID.randomUUID().toString();         SpoutConfig spoutconf  = new SpoutConfig(zkHosts, topic, zkRoot, id);                  builder.setSpout(SPOUT_ID , new KafkaSpout(spoutconf),1);//单线程*/          builder.setSpout( SPOUT_ID, new PVSpout(),1);          builder.setBolt( PVBOLT_ID, new PVBolt(), 4).setNumTasks(8).shuffleGrouping(SPOUT_ID );//2个线程,4个task实例          builder.setBolt( PVSUMBOLT_ID, new PVSumBolt(), 1).shuffleGrouping(PVBOLT_ID );//单线程汇总                  Map<String,Object> conf = new HashMap<String,Object>();         conf.put(Config. TOPOLOGY_RECEIVER_BUFFER_SIZE , 8);         conf.put(Config. TOPOLOGY_TRANSFER_BUFFER_SIZE , 32);         conf.put(Config. TOPOLOGY_EXECUTOR_RECEIVE_BUFFER_SIZE , 16384);         conf.put(Config. 
TOPOLOGY_EXECUTOR_SEND_BUFFER_SIZE , 16384);    /*    try {             StormSubmitter.submitTopology(PVTOPOLOGY_ID, conf, builder.createTopology());         } catch (Exception e) {             e.printStackTrace();         } */         LocalCluster cluster = new LocalCluster();          cluster.submitTopology( PVTOPOLOGY_ID, conf ,builder.createTopology());    }}   



模拟数据源:
package com.yun.storm.pv;import java.io.File;import java.io.IOException;import java.util.Collection;import java.util.List;import java.util.Map;import org.apache.commons.io.FileUtils;import com.yun.redis.PropertyReader;import backtype.storm.spout.SpoutOutputCollector;import backtype.storm.task.TopologyContext;import backtype.storm.topology.OutputFieldsDeclarer;import backtype.storm.topology.base.BaseRichSpout;import backtype.storm.tuple.Fields;import backtype.storm.tuple.Values;/*** 实时读取PV用户行为日志数据 可以从数据库读取数据,可以从kafka读取数据,可以从文件系统读取数据** @author shenfl* @version V1.0*/public class PVSpout extends BaseRichSpout {     /**     *     */     private static final long serialVersionUID = 1L;     private SpoutOutputCollector collector;     private Map stormConf;     /**     * 当PVSpout初始化时候调用一次     *     * @param conf     *            The Storm configuration for this spout.     * @param context     *            可以获取每个任务的TaskID     * @param collector     *            The collector is used to emit tuples from this spout.     
*/     @Override     public void open(Map stormConf, TopologyContext context, SpoutOutputCollector collector) {          this.collector = collector;          this.stormConf = stormConf;     }     /**     * 死循环,一直会调用     */     @Override     public void nextTuple() {          // 获取数据源          try {               String dataDir = PropertyReader.getProperty("parameter.properties", "data.dir");               File file = new File(dataDir);               //获取文件列表               Collection<File> listFiles = FileUtils.listFiles(file, new String[]{"log"},true);                             for (File f : listFiles) {                    //处理文件                    List<String> readLines = FileUtils.readLines(f);                    for (String line : readLines) {                         this.collector.emit(new Values(line));                    }                    // 文件已经处理完成                    try {                         File srcFile = f.getAbsoluteFile();                         File destFile = new File(srcFile + ".done." + System.currentTimeMillis());                         FileUtils.moveFile(srcFile, destFile);                    } catch (IOException e) {                         e.printStackTrace();                    }               }          } catch (Exception e) {               e.printStackTrace();          }     }     /**     * Declare the output schema for all the streams of this topology.     *     * @param declarer     *            this is used to declare output stream ids, output fields, and     *            whether or not each output stream is a direct stream     */     @Override     public void declareOutputFields(OutputFieldsDeclarer declarer) {          declarer.declare(new Fields("line"));     }}


 (2) 一级bolt,高并发情况局部汇总
package com.yun.storm.pv;import java.util.Map;import org.apache.commons.lang.StringUtils;import backtype.storm.task.OutputCollector;import backtype.storm.task.TopologyContext;import backtype.storm.topology.OutputFieldsDeclarer;import backtype.storm.topology.base.BaseRichBolt;import backtype.storm.tuple.Fields;import backtype.storm.tuple.Tuple;import backtype.storm.tuple.Values;/** * 获取PVSpout发送的数据,PVTopology开启多线程。 给出每个线程处理的PV数 * * 在多线程情况下,对PV数据只能局部汇总,不能整体汇总,可以把局部汇总的结果给一个单线程的BOLT进行整体汇总(PVSumBolt) * * @author shenfl * @version V1.0 */public class PVBolt extends BaseRichBolt {    /**     *     */    private static final long serialVersionUID = 1L;    private OutputCollector collector;    private TopologyContext context;    /**     * 实例初始化的时候调用一次     *     * @param stormConf     *            The Storm configuration for this bolt.     * @param context     *            This object can be used to get information about this task's     *            place within the topology, including the task id and component     *            id of this task, input and output information, etc.     * @param collector     *            The collector is used to emit tuples from this bolt     */    @Override    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector ) {          this.context = context ;          this.collector = collector ;    }    private long pv = 0;    /**     * Process a single tuple of input.     *     * @param input     *            The input tuple to be processed.     
*/    @Override    public void execute(Tuple input ) {          try {             String line = input.getStringByField("line" );              if (StringUtils.isNotBlank( line)) {                  pv++;             }              //System.out.println(Thread.currentThread().getName() + "[" + Thread.currentThread().getId() + "]" +context.getThisTaskId()+ "->" + pv);              //this.collector.emit(new Values(Thread.currentThread().getId(),pv));//仅适合一个线程和一个task情况              this.collector .emit(new Values(context.getThisTaskId(),pv ));// 一个线程和1或多个task的情况,TaskId唯一              this.collector .ack(input );         } catch(Exception e ){              e.printStackTrace();              this.collector .fail(input );         }    }    /**     * Declare the output schema for all the streams of this topology.     *     * @param declarer     *            this is used to declare output stream ids, output fields, and     *            whether or not each output stream is a direct stream     */    @Override    public void declareOutputFields(OutputFieldsDeclarer declarer ) {          declarer.declare( new Fields("taskId","pv" ));    }}


 (3)二级bolt,单线程汇总
package com.yun.storm.pv;import java.util.HashMap;import java.util.Map;import java.util.Map.Entry;import org.apache.commons.collections.MapUtils;import org.apache.hadoop.hbase.client.Result;import com.yun.hbase.HBaseUtils;import backtype.storm.task.OutputCollector;import backtype.storm.task.TopologyContext;import backtype.storm.topology.OutputFieldsDeclarer;import backtype.storm.topology.base.BaseRichBolt;import backtype.storm.tuple.Tuple;/*** 汇总PVBolt多个线程的结果* @author shenfl**/public class PVSumBolt extends BaseRichBolt{     /**     *     */     private static final long serialVersionUID = 1L;     private OutputCollector collector;     private Map<Integer,Long> map = new HashMap<Integer,Long>();//<日期,PV数>     @Override     public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {          this.collector = collector;     }     @Override     public void execute(Tuple input) {          try {               Integer taskId = input.getIntegerByField("taskId");               Long pv = input.getLongByField("pv");               map.put(taskId, pv);//map个数为task实例数                             long sum = 0;//获取总数,遍历map 的values,进行sum               for (Entry<Integer, Long> e : map.entrySet()) {                    sum += e.getValue();               }               System.out.println("当前时间:"+System.currentTimeMillis()+"pv汇总结果:" + "->" + sum);               this.collector.ack(input);          }catch(Exception e){               e.printStackTrace();               this.collector.fail(input);          }     }     @Override     public void declareOutputFields(OutputFieldsDeclarer declarer) {              }}



0 0