bloom filter简单实现

来源:互联网 发布:淘宝网天猫投诉电话 编辑:程序博客网 时间:2024/06/10 03:04

再流计算中计算UV是个相当麻烦的事情,特别数据量很大的时候,中间存储就大的吓人。最近项目中遇到分类目计算UV,UV量大概在7000W,有20w多个类目。如果使用简单的存储中间结果再去重,如果使用内存内存打不下,使用Hbase的话HBASE的吞吐又不够。于是准备使用bloom近似计算UV。


写了个bloom filter的demo程序,由于uid都为数字在计算hash值时碰撞率比较搞,于是没有直接对uid使用bloom filter而是对uid 的md5值使用bloom filter:

import java.io.UnsupportedEncodingException;import java.security.MessageDigest;import java.security.NoSuchAlgorithmException;import java.util.BitSet;public class  SimpleBloomFilter {     private static final  int  DEFAULT_SIZE  = 8 << 50000; //2<<60000 ;//     private static final  int [] seeds =new  int []{5,7, 11 , 13 , 31 , 37 , 61};     private static final int[] seeds = new int []{13,31, 131, 1313, 13131, 131313};     private  BitSet bits= new  BitSet(DEFAULT_SIZE);     private  SimpleHash[]  func=new  SimpleHash[seeds.length];//     private SimpleHash[]  func = new SimpleHash();     public static void  main(String[] args) {//        String value  = "stone2083@yahoo.cn" ;     System.out.println(DEFAULT_SIZE);    int count=0;    SimpleBloomFilter filter=new  SimpleBloomFilter();    for(int i = 10000;i<60000;i++){    String a = String.valueOf(i);    String value = getMD5Str(a);    if(!(filter.contains(value))){    count++;    filter.add(value);    }        }        System.out.println("result is :"+count);            }     public  SimpleBloomFilter() {         for( int  i= 0 ; i< seeds.length; i ++ ) {            func[i]=new  SimpleHash(DEFAULT_SIZE, seeds[i]);        }    }     public void  add(String value) {         for(SimpleHash f : func) {         System.out.println(f.hash(value));            bits.set(f.hash(value),  true );        }    }     public boolean  contains(String value) {         if(value ==null ) {             return false ;        }         boolean  ret  = true ;         for(SimpleHash f : func) {            ret=ret&& bits.get(f.hash(value));        }         return  ret;    }          private static String getMD5Str(String str) {           MessageDigest messageDigest = null;              try {               messageDigest = MessageDigest.getInstance("MD5");                  messageDigest.reset();                  messageDigest.update(str.getBytes("UTF-8"));           } catch (NoSuchAlgorithmException e) {               System.out.println("NoSuchAlgorithmException caught!");               System.exit(-1);           } catch (UnsupportedEncodingException e) {               e.printStackTrace();           }              byte[] byteArray = messageDigest.digest();              StringBuffer md5StrBuff = new StringBuffer();              for (int i = 0; i < byteArray.length; i++) {                           if (Integer.toHexString(0xFF & byteArray[i]).length() == 1)                   md5StrBuff.append("0").append(Integer.toHexString(0xFF & byteArray[i]));               else                   md5StrBuff.append(Integer.toHexString(0xFF & byteArray[i]));           }              return md5StrBuff.toString();       }            public static class SimpleHash {         private int  cap;         private int  seed;         public  SimpleHash( int cap, int seed) {             this.cap= cap;             this.seed =seed;        }         public int hash(String value) {             int  result=0 ;             int  len= value.length();             for  (int i= 0 ; i< len; i ++ ) {//                result =seed* result + value.charAt(i);             result =seed* result + value.charAt(i);            }//             return (cap - 1 ) & (result/20);             return (cap - 1 ) & result;//             return result;        }    }}