布隆过滤器
Bloom Filter(布隆过滤器)是由 Burton Howard Bloom 在 1970 年提出的二进制向量数据结构,它具有很好的空间和时间效率,被用来检测一个元素是不是集合中的一个成员。如果检测结果为是,该元素不一定在集合中;但如果检测结果为否,该元素一定不在集合中。因此 Bloom filter 具有 100% 的召回率。这样每个检测请求返回有“在集合内(可能错误)”和“不在集合内(绝对不在集合内)”两种情况,可见 Bloom filter 是牺牲了正确率以换取空间和时间上的效率。
当然布隆过滤器也有缺点,主要是误判的问题:随着存入数据量的增加,误判率也随之增大。解决办法:可以另外建立一个列表(白名单),保存那些容易被误判的数值。
下面转一个我个人认为实现的比较好的BloomFilter:
public class BloomFilter<E> implements Serializable { private BitSet bitset; private int bitSetSize; private double bitsPerElement; private int expectedNumberOfFilterElements; // expected (maximum) number of elements to be added private int numberOfAddedElements; // number of elements actually added to the Bloom filter private int k; // number of hash functions static final Charset charset = Charset.forName("UTF-8"); // encoding used for storing hash values as strings static final String hashName = "MD5"; // MD5 gives good enough accuracy in most circumstances. Change to SHA1 if it's needed static final MessageDigest digestFunction; static { // The digest method is reused between instances MessageDigest tmp; try { tmp = java.security.MessageDigest.getInstance(hashName); } catch (NoSuchAlgorithmException e) { tmp = null; } digestFunction = tmp; } /** * Constructs an empty Bloom filter. The total length of the Bloom filter will be * c*n. * * @param c is the number of bits used per element. * @param n is the expected number of elements the filter will contain. * @param k is the number of hash functions used. */ public BloomFilter(double c, int n, int k) { this.expectedNumberOfFilterElements = n; this.k = k; this.bitsPerElement = c; this.bitSetSize = (int)Math.ceil(c * n); numberOfAddedElements = 0; this.bitset = new BitSet(bitSetSize); } /** * Constructs an empty Bloom filter. The optimal number of hash functions (k) is estimated from the total size of the Bloom * and the number of expected elements. * * @param bitSetSize defines how many bits should be used in total for the filter. * @param expectedNumberOElements defines the maximum number of elements the filter is expected to contain. 
*/ public BloomFilter(int bitSetSize, int expectedNumberOElements) { this(bitSetSize / (double)expectedNumberOElements, expectedNumberOElements, (int) Math.round((bitSetSize / (double)expectedNumberOElements) * Math.log(2.0))); } /** * Constructs an empty Bloom filter with a given false positive probability. The number of bits per * element and the number of hash functions is estimated * to match the false positive probability. * * @param falsePositiveProbability is the desired false positive probability. * @param expectedNumberOfElements is the expected number of elements in the Bloom filter. */ public BloomFilter(double falsePositiveProbability, int expectedNumberOfElements) { this(Math.ceil(-(Math.log(falsePositiveProbability) / Math.log(2))) / Math.log(2), // c = k / ln(2) expectedNumberOfElements, (int)Math.ceil(-(Math.log(falsePositiveProbability) / Math.log(2)))); // k = ceil(-log_2(false prob.)) } /** * Construct a new Bloom filter based on existing Bloom filter data. * * @param bitSetSize defines how many bits should be used for the filter. * @param expectedNumberOfFilterElements defines the maximum number of elements the filter is expected to contain. * @param actualNumberOfFilterElements specifies how many elements have been inserted into the <code>filterData</code> BitSet. * @param filterData a BitSet representing an existing Bloom filter. */ public BloomFilter(int bitSetSize, int expectedNumberOfFilterElements, int actualNumberOfFilterElements, BitSet filterData) { this(bitSetSize, expectedNumberOfFilterElements); this.bitset = filterData; this.numberOfAddedElements = actualNumberOfFilterElements; } /** * Generates a digest based on the contents of a String. * * @param val specifies the input data. * @param charset specifies the encoding of the input data. * @return digest as long. */ public static int createHash(String val, Charset charset) { return createHash(val.getBytes(charset)); } /** * Generates a digest based on the contents of a String. 
* * @param val specifies the input data. The encoding is expected to be UTF-8. * @return digest as long. */ public static int createHash(String val) { return createHash(val, charset); } /** * Generates a digest based on the contents of an array of bytes. * * @param data specifies input data. * @return digest as long. */ public static int createHash(byte[] data) { return createHashes(data, 1)[0]; } /** * Generates digests based on the contents of an array of bytes and splits the result into 4-byte int's and store them in an array. The * digest function is called until the required number of int's are produced. For each call to digest a salt * is prepended to the data. The salt is increased by 1 for each call. * * @param data specifies input data. * @param hashes number of hashes/int's to produce. * @return array of int-sized hashes */ public static int[] createHashes(byte[] data, int hashes) { int[] result = new int[hashes]; int k = 0; byte salt = 0; while (k < hashes) { byte[] digest; synchronized (digestFunction) { digestFunction.update(salt); salt++; digest = digestFunction.digest(data); } for (int i = 0; i < digest.length/4 && k < hashes; i++) { int h = 0; for (int j = (i*4); j < (i*4)+4; j++) { h <<= 8; h |= ((int) digest[j]) & 0xFF; } result[k] = h; k++; } } return result; } /** * Compares the contents of two instances to see if they are equal. * * @param obj is the object to compare to. * @return True if the contents of the objects are equal. 
*/ @Override public boolean equals(Object obj) { if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } final BloomFilter<E> other = (BloomFilter<E>) obj; if (this.expectedNumberOfFilterElements != other.expectedNumberOfFilterElements) { return false; } if (this.k != other.k) { return false; } if (this.bitSetSize != other.bitSetSize) { return false; } if (this.bitset != other.bitset && (this.bitset == null || !this.bitset.equals(other.bitset))) { return false; } return true; } /** * Calculates a hash code for this class. * @return hash code representing the contents of an instance of this class. */ @Override public int hashCode() { int hash = 7; hash = 61 * hash + (this.bitset != null ? this.bitset.hashCode() : 0); hash = 61 * hash + this.expectedNumberOfFilterElements; hash = 61 * hash + this.bitSetSize; hash = 61 * hash + this.k; return hash; } /** * Calculates the expected probability of false positives based on * the number of expected filter elements and the size of the Bloom filter. * <br /><br /> * The value returned by this method is the <i>expected</i> rate of false * positives, assuming the number of inserted elements equals the number of * expected elements. If the number of elements in the Bloom filter is less * than the expected value, the true probability of false positives will be lower. * * @return expected probability of false positives. */ public double expectedFalsePositiveProbability() { return getFalsePositiveProbability(expectedNumberOfFilterElements); } /** * Calculate the probability of a false positive given the specified * number of inserted elements. * * @param numberOfElements number of inserted elements. * @return probability of a false positive. */ public double getFalsePositiveProbability(double numberOfElements) { // (1 - e^(-k * n / m)) ^ k return Math.pow((1 - Math.exp(-k * (double) numberOfElements / (double) bitSetSize)), k); } /** * Get the current probability of a false positive. 
The probability is calculated from * the size of the Bloom filter and the current number of elements added to it. * * @return probability of false positives. */ public double getFalsePositiveProbability() { return getFalsePositiveProbability(numberOfAddedElements); } /** * Returns the value chosen for K.<br /> * <br /> * K is the optimal number of hash functions based on the size * of the Bloom filter and the expected number of inserted elements. * * @return optimal k. */ public int getK() { return k; } /** * Sets all bits to false in the Bloom filter. */ public void clear() { bitset.clear(); numberOfAddedElements = 0; } /** * Adds an object to the Bloom filter. The output from the object's * toString() method is used as input to the hash functions. * * @param element is an element to register in the Bloom filter. */ public void add(E element) { add(element.toString().getBytes(charset)); } /** * Adds an array of bytes to the Bloom filter. * * @param bytes array of bytes to add to the Bloom filter. */ public void add(byte[] bytes) { int[] hashes = createHashes(bytes, k); for (int hash : hashes) bitset.set(Math.abs(hash % bitSetSize), true); numberOfAddedElements ++; } /** * Adds all elements from a Collection to the Bloom filter. * @param c Collection of elements. */ public void addAll(Collection<? extends E> c) { for (E element : c) add(element); } /** * Returns true if the element could have been inserted into the Bloom filter. * Use getFalsePositiveProbability() to calculate the probability of this * being correct. * * @param element element to check. * @return true if the element could have been inserted into the Bloom filter. */ public boolean contains(E element) { return contains(element.toString().getBytes(charset)); } /** * Returns true if the array of bytes could have been inserted into the Bloom filter. * Use getFalsePositiveProbability() to calculate the probability of this * being correct. * * @param bytes array of bytes to check. 
* @return true if the array could have been inserted into the Bloom filter. */ public boolean contains(byte[] bytes) { int[] hashes = createHashes(bytes, k); for (int hash : hashes) { if (!bitset.get(Math.abs(hash % bitSetSize))) { return false; } } return true; } /** * Returns true if all the elements of a Collection could have been inserted * into the Bloom filter. Use getFalsePositiveProbability() to calculate the * probability of this being correct. * @param c elements to check. * @return true if all the elements in c could have been inserted into the Bloom filter. */ public boolean containsAll(Collection<? extends E> c) { for (E element : c) if (!contains(element)) return false; return true; } /** * Read a single bit from the Bloom filter. * @param bit the bit to read. * @return true if the bit is set, false if it is not. */ public boolean getBit(int bit) { return bitset.get(bit); } /** * Set a single bit in the Bloom filter. * @param bit is the bit to set. * @param value If true, the bit is set. If false, the bit is cleared. */ public void setBit(int bit, boolean value) { bitset.set(bit, value); } /** * Return the bit set used to store the Bloom filter. * @return bit set representing the Bloom filter. */ public BitSet getBitSet() { return bitset; } /** * Returns the number of bits in the Bloom filter. Use count() to retrieve * the number of inserted elements. * * @return the size of the bitset used by the Bloom filter. */ public int size() { return this.bitSetSize; } /** * Returns the number of elements added to the Bloom filter after it * was constructed or after clear() was called. * * @return number of elements added to the Bloom filter. */ public int count() { return this.numberOfAddedElements; } /** * Returns the expected number of elements to be inserted into the filter. * This value is the same value as the one passed to the constructor. * * @return expected number of elements. 
*/ public int getExpectedNumberOfElements() { return expectedNumberOfFilterElements; } /** * Get expected number of bits per element when the Bloom filter is full. This value is set by the constructor * when the Bloom filter is created. See also getBitsPerElement(). * * @return expected number of bits per element. */ public double getExpectedBitsPerElement() { return this.bitsPerElement; } /** * Get actual number of bits per element based on the number of elements that have currently been inserted and the length * of the Bloom filter. See also getExpectedBitsPerElement(). * * @return number of bits per element. */ public double getBitsPerElement() { return this.bitSetSize / (double)numberOfAddedElements; } }
BitSet的基本原理
最后再了解一下BitSet的基本原理:BitSet是按位操作的对象,每一位的值只有0或1,内部实现是一个long数组。初始时数组中只有一个long,所以BitSet最小的size是64;当存储的数据增加,初始化的long数组已经无法满足时,BitSet内部会动态扩充,最终内部由N个long来存储。BitSet的内部扩充和List、Set、Map等的实现差不多,而且对用户都是透明的。
1G的空间,有 8*1024*1024*1024=8589934592bit,也就是可以表示85亿个不同的数。
BitSet用1位来表示一个数据是否出现过,0表示没有出现过,1表示出现过。long型数组中的一个元素可以存放64个数,因为Java的long占8个byte=64bit。具体的实现,看看源码:
首先看看set方法的实现:
public void set(int bitIndex) { if (bitIndex < 0) //set的数不能小于0 throw new IndexOutOfBoundsException("bitIndex < 0: " + bitIndex); int wordIndex = wordIndex(bitIndex);//将bitIndex右移6位,这样可以保证每64个数字在long型数组中可以占一个坑。 expandTo(wordIndex); words[wordIndex] |= (1L << bitIndex); // Restores invariants checkInvariants(); }
get命令实现:
public boolean get(int bitIndex) { if (bitIndex < 0) throw new IndexOutOfBoundsException("bitIndex < 0: " + bitIndex); checkInvariants(); int wordIndex = wordIndex(bitIndex);//和get一样获取数字在long型数组的那个位置。 return (wordIndex < wordsInUse) && ((words[wordIndex] & (1L << bitIndex)) != 0);//在指定long型数组元素中获取值。 }
BitSet容量动态扩展:
private void ensureCapacity(int wordsRequired) { if (words.length < wordsRequired) { // Allocate larger of doubled size or required size int request = Math.max(2 * words.length, wordsRequired);//默认是扩大一杯的容量,如果传入的数字大于两倍的,则以传入的为准。 // wordsRequired = 传入的数值右移6位 + 1 words = Arrays.copyOf(words, request); sizeIsSticky = false; } }
BitSet中实现了Cloneable接口,并定义在表中列出的方法:
SN | Methods with 描述 |
---|---|
1 |
void and(BitSet bitSet)
将调用对象与指定的bitSet按位做与(AND)运算,结果存放到调用对象。 |
2 |
void andNot(BitSet bitSet)
对于bitSet每1位,在调用BitSet中的相应位清零。 |
3 |
int cardinality( )
返回BitSet中值为1(即被设置为true)的位的个数。 |
4 |
void clear( )
所有位清零。 |
5 |
void clear(int index)
index指定的位清零。 |
6 |
void clear(int startIndex, int endIndex)
将从startIndex(含)到endIndex(不含)的位清零。 |
7 |
Object clone( )
复制调用对象,返回其副本。 |
8 |
boolean equals(Object bitSet)
如果调用对象与传入的bitSet内容相同,则返回true;否则返回false。 |
9 |
void flip(int index)
逆转由index指定的位。 |
10 |
void flip(int startIndex, int endIndex)
反转从startIndex(含)到endIndex(不含)的位。 |
11 |
boolean get(int index)
返回指定索引处的位的当前状态。 |
12 |
BitSet get(int startIndex, int endIndex)
返回一个新的BitSet,它包含调用对象中从startIndex到endIndex-1的位。调用对象不被改变。 |
13 |
int hashCode( )
返回调用对象的哈希代码。 |
14 |
boolean intersects(BitSet bitSet)
如果至少有一个对调用对象和bitSet内相应位为1,则返回true。 |
15 |
boolean isEmpty( )
返回true如果在调用对象中的所有位均为零。 |
16 |
int length( )
返回到持有调用BitSet中的内容所需的比特数。这个值是由最后1位的位置决定的。 |
17 |
int nextClearBit(int startIndex)
返回下个清零位的索引,(即,下一个零位),从由startIndex指定的索引开始 |
18 |
int nextSetBit(int startIndex)
返回下一个置位(即,下一个值为1的位)的索引,从由startIndex指定的索引开始。如果此后没有位被设置,则返回-1。 |
19 |
void or(BitSet bitSet)
将调用对象与指定的bitSet按位做或(OR)运算,结果存放到调用对象。 |
20 |
void set(int index)
设置由index指定的位。 |
21 |
void set(int index, boolean v)
将index指定的位设置为v:v为true则置位,为false则清除该位。 |
22 |
void set(int startIndex, int endIndex)
将从startIndex到endIndex-1的位设置为1。 |
23 |
void set(int startIndex, int endIndex, boolean v)
将从startIndex到endIndex-1的位设置为v:v为true则置位,为false则清除。 |
24 |
int size( )
返回调用BitSet实际占用的存储空间的位数。 |
25 |
String toString( )
返回字符串相当于调用BitSet中的对象。 |
26 |
void xor(BitSet bitSet)
将调用对象与指定的bitSet按位做异或(XOR)运算,结果存放到调用对象。 |
BloomFilter的使用场景
1,爬虫的URL过滤。
2,日志分析
3,用户数统计等等等
总之,使用布隆过滤器的应该是可以容忍小概率误判的场景,不然慎用。
版权声明:本文为博主原创文章,未经博主允许不得转载。