A Spark-based new-word discovery model
I haven't blogged in a long while; most of my notes have gone into WizNote, and I kept meaning to write about Spark, Hadoop, and machine learning. Back to the topic: a colleague recommended an article on new-word discovery (original at http://www.matrix67.com/blog/archives/5044), and our company happened to have exactly this need. After a bit more than a week of writing, testing, and optimizing, the job can now handle what passes for big data here: tens of gigabytes of text are no problem. It is implemented on Spark; the code is below. Corrections are welcome, and there is probably still room for performance tuning. Please credit this post if you repost it.
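For context: the method in matrix67's article scores every candidate string on three statistics — its frequency, its internal cohesion (the pointwise mutual information of the candidate against its worst binary split), and the freedom of its boundaries (the information entropy of the characters immediately to its left and right) — and promotes a candidate to a word only when all three are high. Before wading into the Spark version, it may help to see the same scoring on a single machine. The sketch below is mine rather than the author's and runs on a toy corpus:

    object ScoringSketch {
      // Shannon entropy of the empirical distribution of neighboring characters
      def entropy(neighbors: Seq[Char]): Double = {
        val total = neighbors.size.toDouble
        neighbors.groupBy(identity).values
          .map(g => g.size / total)
          .map(p => -p * math.log(p))
          .sum
      }

      def main(args: Array[String]): Unit = {
        val text = "南京市长江大桥 南京市民 南京市长"
        val n = 3                                  // candidate length in characters
        def freq(s: String): Int = text.sliding(s.length).count(_ == s)
        val total = text.length.toDouble

        val counts = text.sliding(n).toSeq.groupBy(identity).map { case (w, occ) => (w, occ.size) }
        for ((w, f) <- counts if f > 1) {
          // cohesion: PMI of the worst binary split, log( p(w) / (p(a) * p(b)) )
          val pmi = (1 until w.length).map { i =>
            math.log(f * total / (freq(w.take(i)) * freq(w.drop(i))))
          }.min
          // boundary freedom: entropy of the characters seen left/right of w
          val left  = text.sliding(n + 1).filter(_.endsWith(w)).map(_.head).toSeq
          val right = text.sliding(n + 1).filter(_.startsWith(w)).map(_.last).toSeq
          val e = math.min(entropy(left), entropy(right))
          println(f"$w%s freq=$f%d pmi=$pmi%.2f entropy=$e%.2f")
        }
      }
    }

The Spark job below computes the same three quantities at scale (plus each candidate's document frequency) and keeps only the candidates whose cohesion and entropy clear the pmi and info thresholds passed on the command line.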
package com.icklick.spark.wordSegment

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import com.iclick.spark.wordSegment.util.CounterMap
import scala.collection.mutable.ArrayBuffer
import com.google.common.collect.Maps
import java.text.SimpleDateFormat
import scala.collection.JavaConversions._
import scala.collection.JavaConverters._
import scala.collection.mutable.Map
import com.iclick.spark.wordSegment.util.AtomsUitl
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SaveMode
import com.iclick.spark.wordSegment.util.ConterHashSet
import org.apache.commons.lang.StringUtils
import com.mysql.jdbc.Driver // only needed for the commented-out MySQL writer below

// input directory on HDFS, e.g. /tmp/yuming/webtable/ds=16-04-17
object WordSegment {

  def main(args: Array[String]): Unit = {
    // silence noisy framework logs
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    if (args.length < 5) {
      System.err.println("Usage: path maxLen pmi info shuffle_count [save_path]")
      System.exit(1)
    }
    val path = args(0)                 // input text on HDFS
    val maxLen = args(1).toInt         // longest candidate word, in characters
    val pmi = args(2).toDouble         // cohesion (PMI) threshold
    val info = args(3).toDouble        // neighbor-entropy threshold
    val shuffle_count = args(4).toInt  // spark.sql.shuffle.partitions
    val save_path_result = if (args.length >= 6) args(5) else "/tmp/wilson/"

    val conf = new SparkConf()
      .set("spark.driver.maxResultSize", "10g")
      .set("spark.sql.shuffle.partitions", s"$shuffle_count")
      .set("spark.network.timeout", "850s")
      .set("spark.shuffle.compress", "true")
      .set("spark.shuffle.spill.compress", "true")
      .set("spark.shuffle.manager", "sort")
    if (System.getProperty("local") != null) {
      conf.setMaster("local").setAppName("wordSegname")
    }
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // For a local test, replace the block above with something like:
    //   val conf = new SparkConf().setAppName("wordSegname").setMaster("local[4]")
    //     .set("spark.sql.shuffle.partitions", "10").set("spark.network.timeout", "30s")
    //   val path = "D:\\wilson.zhou\\Downloads\\西游记.txt"
    //   val maxLen = 6; val pmi = 0; val info = 0; val save_path_result = "/tmp/wilson/"

    val sdf = new SimpleDateFormat("yyyy-MM-dd:HH:mm:ss")
    val start = sdf.format(System.currentTimeMillis())

    // replace stop characters, punctuation and whitespace/control characters with spaces
    val word1 = sc.textFile(path).map { x =>
      x.replaceAll("[" + AtomsUitl.stopwords + "]", " ")
        .replaceAll("\\p{Punct}", " ")
        .replaceAll("\\pP", " ")
        .replaceAll("　", " ") // full-width space
        .replaceAll("\\p{Blank}", " ")
        .replaceAll("\\p{Space}", " ")
        .replaceAll("\\p{Cntrl}", " ")
    }
    val sum_document = word1.count() // each input line counts as one document

    // (term, documentId) pairs
    val word_document = word1.zipWithIndex
      .filter(x => !StringUtils.isBlank(x._1))
      .flatMap { x =>
        val arr = ArrayBuffer[(String, Int)]()
        for (i <- x._1.split(" ")) arr += ((i, x._2.toInt))
        arr
      }
      .map(x => (x._1.trim, x._2))
      .filter(x => !StringUtils.isBlank(x._1))

    println("Calculate each term's document frequency")
    // every suffix (capped at maxLen characters) of every "$"-delimited term,
    // tagged with the id of the document it came from
    val word_document_caculate = word_document
      .map(x => ("$" + x._1 + "$", x._2))
      .flatMap { x =>
        val arr = ArrayBuffer[(String, Int)]()
        for (y <- 1 to AtomsUitl.len(x._1) - 2) {
          arr += ((AtomsUitl.substring(x._1, y, Math.min(maxLen + y, AtomsUitl.len(x._1))), x._2))
        }
        arr
      }
      .sortBy(x => x._1)

    println("Document-frequency calculation will start")
    // group the suffixes by first character so each group fits in one task,
    // then count the distinct documents containing each prefix (= candidate)
    val word_document_result = word_document_caculate
      .map(x => (AtomsUitl.substring(x._1, 0, 1), x._1, x._2))
      .groupBy((f: (String, String, Int)) => f._1)
      .map(x => x._2)
      .flatMap { x =>
        val documnet = Maps.newHashMap[String, ConterHashSet]()
        val arrBuff = ArrayBuffer[(String, Int)]()
        for (curr <- x) {
          for (ii <- 1 to AtomsUitl.len(curr._2) - 1) {
            val w1 = AtomsUitl.substring(curr._2, 0, ii)
            if (documnet.containsKey(w1)) {
              documnet.get(w1).addelment(curr._3.asInstanceOf[java.lang.Integer])
            } else {
              val cm = new ConterHashSet()
              cm.addelment(curr._3.asInstanceOf[java.lang.Integer])
              documnet.put(w1, cm)
            }
          }
        }
        val documnet_iter = documnet.keySet().iterator()
        while (documnet_iter.hasNext()) {
          val w = documnet_iter.next()
          arrBuff += ((w, documnet.get(w).getsize()))
        }
        arrBuff
      }

    println("Calculate the information entropies")
    val word = word1.flatMap(x => x.split(" ")).filter(x => !StringUtils.isBlank(x))

    // -- left-neighbor entropy: reverse each term so the left neighbor becomes
    //    a suffix problem, then proceed exactly as on the right side
    println("Calculate the left-neighbor information entropy ...")
    val wordleft = word.map(x => AtomsUitl.reverse(x)).map(x => "$" + x + "$").flatMap { x =>
      val arr = ArrayBuffer[String]()
      for (y <- 1 to AtomsUitl.len(x) - 2) {
        arr += AtomsUitl.substring(x, y, Math.min(maxLen + y, AtomsUitl.len(x)))
      }
      arr
    }.sortBy(x => x)

    val wordleft_caculate = wordleft
      .map(s => (AtomsUitl.substring(s, 0, 1), s))
      .groupBy((f: (String, String)) => f._1)
      .map(x => x._2)
      .flatMap { x =>
        // for each prefix w, count the characters that follow it
        val stat = Maps.newHashMap[String, CounterMap]()
        val arrBuff = ArrayBuffer[(String, Double)]()
        for (curr <- x) {
          for (ii <- 1 to AtomsUitl.len(curr._2) - 1) {
            val w = AtomsUitl.substring(curr._2, 0, ii)
            val suffix = AtomsUitl.substring(AtomsUitl.substring(curr._2, ii), 0, 1)
            if (stat.containsKey(w)) {
              stat.get(w).incr(suffix)
            } else {
              val cm = new CounterMap()
              cm.incr(suffix)
              stat.put(w, cm)
            }
          }
        }
        // H(w) = -sum_c p(c|w) * log p(c|w) over the neighbor characters c
        val iterator_stat = stat.keySet().iterator()
        while (iterator_stat.hasNext()) {
          val w = iterator_stat.next()
          val cm = stat.get(w)
          var freq = 0
          var re = 0.0
          val cm_iter = cm.countAll().keySet().iterator()
          while (cm_iter.hasNext()) {
            freq += cm.get(cm_iter.next())
          }
          val cm_iter1 = cm.countAll().keySet().iterator()
          while (cm_iter1.hasNext()) {
            val p = cm.get(cm_iter1.next()) * 1.0 / freq
            re += -1 * Math.log(p) * p
          }
          arrBuff += ((AtomsUitl.reverse(w), re)) // un-reverse the candidate
        }
        arrBuff
      }

    // -- right-neighbor entropy (this pass also yields each candidate's frequency)
    println("Calculate the right-neighbor information entropy ...")
    val wordright = word.map(x => "$" + x + "$").flatMap { x =>
      val arr = ArrayBuffer[String]()
      for (y <- 1 to AtomsUitl.len(x) - 2) {
        arr += AtomsUitl.substring(x, y, Math.min(maxLen + y, AtomsUitl.len(x)))
      }
      arr
    }.sortBy(x => x)

    val wordright_caculate = wordright
      .map(s => (AtomsUitl.substring(s, 0, 1), s))
      .groupBy((f: (String, String)) => f._1)
      .map(x => x._2)
      .flatMap { x =>
        val stat = Maps.newHashMap[String, CounterMap]()
        val arrBuff = ArrayBuffer[(String, Int, Double)]()
        for (curr <- x) {
          for (i <- 1 to AtomsUitl.len(curr._2) - 1) {
            val w = AtomsUitl.substring(curr._2, 0, i)
            val suffix = AtomsUitl.substring(AtomsUitl.substring(curr._2, i), 0, 1)
            if (stat.containsKey(w)) {
              stat.get(w).incr(suffix)
            } else {
              val cm = new CounterMap()
              cm.incr(suffix)
              stat.put(w, cm)
            }
          }
        }
        val iterator_stat = stat.keySet().iterator()
        while (iterator_stat.hasNext()) {
          val w = iterator_stat.next()
          val cm = stat.get(w)
          var freq = 0
          var re = 0.0
          val cm_iter = cm.countAll().keySet().iterator()
          while (cm_iter.hasNext()) {
            freq += cm.get(cm_iter.next())
          }
          val cm_iter1 = cm.countAll().keySet().iterator()
          while (cm_iter1.hasNext()) {
            val p = cm.get(cm_iter1.next()) * 1.0 / freq
            re += -1 * Math.log(p) * p
          }
          arrBuff += ((w, freq, re))
        }
        arrBuff
      }

    // -- merge the two sides: keep min(left entropy, right entropy) as "info"
    println("Merge will begin ...")
    import sqlContext.implicits._
    val wordright_caculate_todf = wordright_caculate.toDF("right_name", "freq", "right_info")
    val wordleft_caculate_todf = wordleft_caculate.toDF("left_name", "left_info")
    val udf_get_min: (Double, Double) => Double = (arg1, arg2) => Math.min(arg1, arg2)
    val sqlfunctin = udf(udf_get_min)
    val word_caculate_total = wordright_caculate_todf
      .join(wordleft_caculate_todf,
        wordright_caculate_todf("right_name") === wordleft_caculate_todf("left_name"), "left")
      .withColumn("info", sqlfunctin(col("right_info"), col("left_info")))
      .drop("right_info").drop("left_name").drop("left_info")
      .filter(length(wordright_caculate_todf("right_name")) > 1)
      .rdd

    println("Calculate the cohesion (PMI)")
    val size_pmi = wordright_caculate.count() // total candidate count, the N in the PMI formula
    println("candidate count in the final step: " + size_pmi)

    // expand every candidate into all of its binary splits (lw, rw)
    val last = word_caculate_total.flatMap { x =>
      val w = x.apply(0).toString
      val f = x.apply(1).toString.toInt
      val e = x.apply(2).toString.toDouble
      val arr = ArrayBuffer[(String, Int, Double, String, String)]()
      for (s <- 1 to AtomsUitl.len(w) - 1) {
        try {
          val lw = AtomsUitl.substring(w, 0, s)
          val rw = AtomsUitl.substring(w, s)
          arr += ((w, f, e, lw, rw))
        } catch {
          case ex: Exception => arr += (("", 0, 0.0, "", ""))
        }
      }
      arr
    }.filter(f => !StringUtils.isBlank(f._4) && !StringUtils.isBlank(f._5))

    println("DataFrame merge will begin ...")
    val df = last.toDF("w_total", "f", "e", "lw", "rw")
    val df1 = wordright_caculate.toDF("w", "freq", "re")
    // look up the frequencies of the left and right part of each split
    val df2_drop = df.join(df1, df("lw") === df1("w"), "left")
      .drop("re").drop("w").withColumnRenamed("freq", "lw_freq")
    val df3_drop = df2_drop.join(df1, df2_drop("rw") === df1("w"), "left")
      .drop("re").drop("w").withColumnRenamed("freq", "rw_freq")

    println("DataFrame join is done")
    // cohesion: pmi = log( f(w) * N / (f(lw) * f(rw)) ), i.e. log p(w)/(p(lw)p(rw))
    val udf_get_pmi = (arg1: Int, arg2: Int, arg3: Int) =>
      Math.log((arg1.toLong * size_pmi * 1.0) / (arg2.toLong * arg3.toLong))
    val udf_get_pmi_udf = udf(udf_get_pmi)
    val resultToDf = df3_drop
      .withColumn("pmi", udf_get_pmi_udf(col("f"), col("rw_freq"), col("lw_freq")))
      .withColumn("zero", col("f") * 0) // all-zero helper column, used below
      .drop("rw_freq").drop("lw_freq").drop("lw").drop("rw")
      // keep one row per candidate; note the descending sort keeps the split with
      // the largest PMI, whereas matrix67's scoring uses the smallest over all splits
      .sort($"w_total", $"pmi".desc)
      .dropDuplicates(Array("w_total"))
      .filter($"pmi" > pmi && $"e" > info) // threshold on cohesion and entropy
      .withColumnRenamed("w_total", "name")
      .withColumnRenamed("f", "freq")
      .withColumnRenamed("e", "info")

    println("The final result will be calculated")
    val word_document_resultToDf = word_document_result.toDF("name1", "document")
    val resultToDf2 = resultToDf
      .join(word_document_resultToDf, word_document_resultToDf("name1") === resultToDf("name"), "left")
      .withColumn("documentcount", col("zero") + sum_document) // total document count
      .drop("zero").drop("name1")

    // store the result in HDFS
    println("Results will be stored into HDFS.")
    val sdf1 = new SimpleDateFormat("yy-MM-dd")
    val save_path = save_path_result + sdf1.format(System.currentTimeMillis())
    try {
      resultToDf2.rdd.map { x =>
        // columns: name, freq, info (entropy), pmi, document, documentcount
        val name = x.apply(0).toString
        val freq = x.apply(1).toString
        val entropy = x.apply(2).toString  // min of left/right neighbor entropy
        val pmiValue = x.apply(3).toString // cohesion
        val document = x.apply(4).toString
        val documenttotal = x.apply(5).toString
        s"${name},${freq},${pmiValue},${entropy},${document},${documenttotal}"
      }.saveAsTextFile(save_path)
      println(".................... success .............")
    } catch {
      case e: Exception => println("some errors happened when saving the final data")
    }

    // Alternatively, write the result into MySQL (credentials redacted):
    //   val driver = "com.mysql.jdbc.Driver"
    //   Class.forName(driver)
    //   val url = "jdbc:mysql://10.1.1.28:3306/spark"
    //   val pro = new java.util.Properties
    //   pro.setProperty("user", "****")
    //   pro.setProperty("password", "****")
    //   pro.setProperty("use_unicode", "true")
    //   pro.setProperty("characterEncoding", "utf8")
    //   resultToDf2.write.mode(SaveMode.Overwrite).jdbc(url, "wordsegment", pro)

    println(start)
    println(sdf.format(System.currentTimeMillis()))
    sc.stop()
  }
}
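The listing depends on three helper classes from com.iclick.spark.wordSegment.util — CounterMap, ConterHashSet and AtomsUitl — that the post does not include. Only their call sites constrain what they must do; a minimal reconstruction consistent with those call sites (my guess, not the author's code) could look like this:

    // Guessed reconstructions: only the method names and call sites in the
    // listing above constrain these implementations.
    class CounterMap extends java.io.Serializable {
      private val counts = new java.util.HashMap[String, Int]()
      def incr(key: String): Unit = counts.put(key, get(key) + 1) // count one neighbor character
      def get(key: String): Int = if (counts.containsKey(key)) counts.get(key) else 0
      def countAll(): java.util.HashMap[String, Int] = counts
    }

    class ConterHashSet extends java.io.Serializable {
      private val ids = new java.util.HashSet[java.lang.Integer]()
      def addelment(i: java.lang.Integer): Unit = ids.add(i) // record one document id
      def getsize(): Int = ids.size()                        // distinct documents containing the term
    }

    object AtomsUitl {
      // characters stripped from the input; the original list is not shown in the post
      val stopwords: String = ""
      // code-point-aware length/substring, presumably so that characters outside
      // the BMP are not cut in half (plain String methods count UTF-16 units)
      def len(s: String): Int = s.codePointCount(0, s.length)
      def substring(s: String, begin: Int): String =
        s.substring(s.offsetByCodePoints(0, begin))
      def substring(s: String, begin: Int, end: Int): String =
        s.substring(s.offsetByCodePoints(0, begin), s.offsetByCodePoints(0, end))
      def reverse(s: String): String =
        new java.lang.StringBuilder(s).reverse.toString // surrogate-pair safe
    }

Submitting the job follows the argument order parsed at the top of main: path, maxLen, pmi, info, shuffle_count, plus an optional save path. For example (the jar name, master and threshold values here are placeholders, not recommendations):

    spark-submit --class com.icklick.spark.wordSegment.WordSegment \
      --master yarn \
      wordsegment.jar \
      /tmp/yuming/webtable/ds=16-04-17 6 1.0 2.0 200 /tmp/wilson/

Each line of the output is a CSV record: name, frequency, pmi (cohesion), entropy (the smaller of the left and right neighbor entropies), the number of documents containing the word, and the total document count.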