Bloom filter的实现以及常用的hash函数

bloom filter利用时间换空间的思想，利用多个哈希函数，将一个元素的存在状态映射到多个bit中，特别是在网络环境中，BF具有广泛的用途，关键问题就是要减少false positive rate（可以设置参数来调节），扩展有 counting BF。这里选用的hash函数是表现较好的 BKDRHash , SDBMHash, DJBHash
。

Bloom-filter代码：

bloom_filter.h

#ifndef __BLOOM_FILTER_H__
#define __BLOOM_FILTER_H__

#include<stdlib.h>

typedef unsigned int (*hashfunc_t)(const char *, unsigned int len);

typedef struct {
    size_t asize;
    unsigned char *a; //to store the state of existence bits
    size_t nfuncs;
    hashfunc_t *funcs; // hash funcs
} BLOOM;

BLOOM *bloom_create(size_t size, size_t nfuncs, ...);
int bloom_destroy(BLOOM *bloom);
int bloom_add(BLOOM *bloom, const char *s, unsigned int len);
int bloom_check(BLOOM *bloom, const char *s, unsigned int len);

#endif

bloom_filter.c

#include<limits.h>
#include<stdarg.h>

#include"bloom_filter.h"

#define SETBIT(a, n) (a[n/CHAR_BIT] |= (1<<(n%CHAR_BIT)))
#define GETBIT(a, n) (a[n/CHAR_BIT] & (1<<(n%CHAR_BIT)))

BLOOM *bloom_create(size_t size, size_t nfuncs, ...)
{
    BLOOM *bloom;
    va_list l;
    int n;

    if(!(bloom=malloc(sizeof(BLOOM)))) return NULL;

    // ceil the number of char to malloc for the bloom
    if(!(bloom->a=calloc((size+CHAR_BIT-1)/CHAR_BIT, sizeof(char)))) {
        free(bloom);
        return NULL;
    }
    if(!(bloom->funcs=(hashfunc_t*)malloc(nfuncs*sizeof(hashfunc_t)))) {
        free(bloom->a);
        free(bloom);
        return NULL;
    }

    va_start(l, nfuncs);
    for(n=0; n<nfuncs; ++n) {
        bloom->funcs[n]=va_arg(l, hashfunc_t);
    }
    va_end(l);

    bloom->nfuncs=nfuncs;
    bloom->asize=size;

    return bloom;
}

int bloom_destroy(BLOOM *bloom)
{
    free(bloom->a);
    free(bloom->funcs);
    free(bloom);

    return 0;
}

int bloom_add(BLOOM *bloom, const char *s, unsigned int len)
{
    size_t n;

    for(n=0; n<bloom->nfuncs; ++n) {
        SETBIT(bloom->a, bloom->funcs[n](s, len)%bloom->asize);
    }

    return 0;
}

int bloom_check(BLOOM *bloom, const char *s, unsigned int len)
{
    size_t n;

    for(n=0; n<bloom->nfuncs; ++n) {
        if(!(GETBIT(bloom->a, bloom->funcs[n](s, len)%bloom->asize))) return 0;
    }

    return 1;
}

常用hash函数文件：

#include "global.h"

// Author: Arash Partow - 2002 

unsigned int RSHash(char* str, unsigned int len)
{
   unsigned int b    = 378551;
   unsigned int a    = 63689;
   unsigned int hash = 0;
   unsigned int i    = 0;

   for(i = 0; i < len; str++, i++)
   {
      hash = hash * a + (*str);
      a    = a * b;
   }

   return hash;
}
/* End Of RS Hash Function */

unsigned int JSHash(char* str, unsigned int len)
{
   unsigned int hash = 1315423911;
   unsigned int i    = 0;

   for(i = 0; i < len; str++, i++)
   {
      hash ^= ((hash << 5) + (*str) + (hash >> 2));
   }

   return hash;
}
/* End Of JS Hash Function */

unsigned int PJWHash(char* str, unsigned int len)
{
   const unsigned int BitsInUnsignedInt = (unsigned int)(sizeof(unsigned int) * 8);
   const unsigned int ThreeQuarters     = (unsigned int)((BitsInUnsignedInt  * 3) / 4);
   const unsigned int OneEighth         = (unsigned int)(BitsInUnsignedInt / 8);
   const unsigned int HighBits          = (unsigned int)(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth);
   unsigned int hash              = 0;
   unsigned int test              = 0;
   unsigned int i                 = 0;

   for(i = 0; i < len; str++, i++)
   {
      hash = (hash << OneEighth) + (*str);

      if((test = hash & HighBits)  != 0)
      {
         hash = (( hash ^ (test >> ThreeQuarters)) & (~HighBits));
      }
   }

   return hash;
}
/* End Of  P. J. Weinberger Hash Function */

unsigned int ELFHash(char* str, unsigned int len)
{
   unsigned int hash = 0;
   unsigned int x    = 0;
   unsigned int i    = 0;

   for(i = 0; i < len; str++, i++)
   {
      hash = (hash << 4) + (*str);
      if((x = hash & 0xF0000000L) != 0)
      {
         hash ^= (x >> 24);
      }
      hash &= ~x;
   }

   return hash;
}
/* End Of ELF Hash Function */

/*This hash function comes from Brian Kernighan and Dennis Ritchie's book "The C Programming Language".
* It is a simple hash function using a strange set of possible seeds which all constitute a pattern
* of 31....31...31 etc, it seems to be very similar to the DJB hash function.
*/
unsigned int BKDRHash(char* str, unsigned int len)
{
   unsigned int seed = 131; /* 31 131 1313 13131 131313 etc.. */
   unsigned int hash = 0;
   unsigned int i    = 0;

   for(i = 0; i < len; str++, i++)
   {
      hash = (hash * seed) + (*str);
   }

   return hash;
}
/* End Of BKDR Hash Function */

/*This is the algorithm of choice which is used in the open source SDBM project.
* The hash function seems to have a good over-all distribution for many different
* data sets. It seems to work well in situations where there is a high variance
* in the MSBs of the elements in a data set.
*/
unsigned int SDBMHash(char* str, unsigned int len)
{
   unsigned int hash = 0;
   unsigned int i    = 0;

   for(i = 0; i < len; str++, i++)
   {
      hash = (*str) + (hash << 6) + (hash << 16) - hash;
   }

   return hash;
}
/* End Of SDBM Hash Function */

/*An algorithm produced by Professor Daniel J. Bernstein and shown first
* to the world on the usenet newsgroup comp.lang.c. It is one of the most
* efficient hash functions ever published.
*/
unsigned int DJBHash(char* str, unsigned int len)
{
   unsigned int hash = 5381;
   unsigned int i    = 0;

   for(i = 0; i < len; str++, i++)
   {
      hash = ((hash << 5) + hash) + (*str);
   }

   return hash;
}
/* End Of DJB Hash Function */

unsigned int DEKHash(char* str, unsigned int len)
{
   unsigned int hash = len;
   unsigned int i    = 0;

   for(i = 0; i < len; str++, i++)
   {
      hash = ((hash << 5) ^ (hash >> 27)) ^ (*str);
   }
   return hash;
}
/* End Of DEK Hash Function */

unsigned int BPHash(char* str, unsigned int len)
{
   unsigned int hash = 0;
   unsigned int i    = 0;
   for(i = 0; i < len; str++, i++)
   {
      hash = hash << 7 ^ (*str);
   }

   return hash;
}
/* End Of BP Hash Function */

unsigned int FNVHash(char* str, unsigned int len)
{
   const unsigned int fnv_prime = 0x811C9DC5;
   unsigned int hash      = 0;
   unsigned int i         = 0;

   for(i = 0; i < len; str++, i++)
   {
      hash *= fnv_prime;
      hash ^= (*str);
   }

   return hash;
}
/* End Of FNV Hash Function */

unsigned int APHash(char* str, unsigned int len)
{
   unsigned int hash = 0xAAAAAAAA;
   unsigned int i    = 0;

   for(i = 0; i < len; str++, i++)
   {
      hash ^= ((i & 1) == 0) ? (  (hash <<  7) ^ (*str) * (hash >> 3)) :
                               (~((hash << 11) + (*str) ^ (hash >> 5)));
   }

   return hash;
}
/* End Of AP Hash Function */

测试程序：这里是先对文件分块，而后对指纹进行BF，以达到节省空间的目的。

#include "global.h"

int main(int argc, char ** argv){
    int i = 0, id = 1, result, count=0;
    int sockfd, fd;
    FileInfo *fi=file_new();
    FingerChunk *p;
    BLOOM *bloom = NULL;

    if(argc != 2)
        err_quit("usage:bloom  <file>");

    //creat a bloom filter .TODO what size is best??
    if(!(bloom=bloom_create(2500000, 3, BKDRHash, SDBMHash, DJBHash))) {
        fprintf(stderr, "ERROR: Could not create bloom filter\n");
        return EXIT_FAILURE;
    }

    // chunk the file
    strcpy(fi->file_path, argv[1]);
    chunk_file(fi);

    printf("File size : %lld\n",fi->file_size);
    printf("Chunk Num : %d\n",fi->chunknum);

    p= fi->first;
    while(p){
        char hash[41];
        digestToHash(p->chunk_hash,hash);
        printf("chunklen : %d , Fingerprint : %s\n", p->chunklen, hash); // just
        //add each signature to the bloom filter
        bloom_add(bloom, p->chunk_hash, 20);  //
        p=p->next;
    }

    //How to test the false positive ??
    p = fi->first;
    printf("1.BKDR Hash Function value : %u\n", BKDRHash(p->chunk_hash, 20));
    printf("2.SDBM Hash Function value : %u\n", SDBMHash(p->chunk_hash, 20));
    printf("3.DJB Hash Function value : %u\n", DJBHash(p->chunk_hash, 20));
    bloom_destroy(bloom);
    file_free(fi);
}

参考：

1. http://www.cnblogs.com/-clq/archive/2012/05/31/2528153.html

2. http://www.partow.net/programming/hashfunctions/#top

Bloom filter的实现以及常用的hash函数,布布扣,bubuko.com

时间： 2024-12-24 04:21:59

Bloom filter的实现以及常用的hash函数的相关文章

常用字符串Hash函数

几个常用的字符串Hash函数如下: SDBMHash函数 unsigned int SDBMHash(char *str) { unsigned int hash = 0; while (*str) { // equivalent to: hash = 65599*hash + (*str++); hash = (*str++) + (hash << 6) + (hash << 16) - hash; } return (hash & 0x7FFFFFFF); } RSHa

bloom filter 详解［转］

Bloom Filter概念和原理焦萌 2007年1月27日 Bloom Filter是一种空间效率很高的随机数据结构,它利用位数组很简洁地表示一个集合,并能判断一个元素是否属于这个集合.Bloom Filter的这种高效是有一定代价的:在判断一个元素是否属于某个集合时,有可能会把不属于这个集合的元素误认为属于这个集合(false positive).因此,Bloom Filter不适合那些“零错误”的应用场合.而在能容忍低错误率的应用场合下,Bloom Filter通过极少的错误换取了存储空

Bloom Filter概念和原理

Bloom Filter是一种空间效率很高的随机数据结构,它利用位数组很简洁地表示一个集合,并能判断一个元素是否属于这个集合.Bloom Filter的这种高效是有一定代价的:在判断一个元素是否属于某个集合时,有可能会把不属于这个集合的元素误认为属于这个集合(false positive).因此,Bloom Filter不适合那些“零错误”的应用场合.而在能容忍低错误率的应用场合下,Bloom Filter通过极少的错误换取了存储空间的极大节省. 集合表示和元素查询下面我们具体来看Bloom

Bloom Filter布隆过滤器

http://blog.csdn.net/pipisorry/article/details/64127666 Bloom Filter简介 Bloom Filter是一种空间效率很高的随机数据结构,它利用位数组很简洁地表示一个集合,并能判断一个元素是否属于这个集合.布隆过滤器(英语:Bloom Filter)是1970年由布隆提出的.它实际上是一个很长的二进制向量和一系列随机映射函数.布隆过滤器可以用于检索一个元素是否在一个集合中.它的优点是空间效率和查询时间都远远超过一般的算法,缺点是有一定

Bloom Filter概念和原理【转】

Bloom Filter概念和原理 Bloom Filter是一种空间效率很高的随机数据结构,它利用位数组很简洁地表示一个集合,并能判断一个元素是否属于这个集合.Bloom Filter的这种高效是有一定代价的:在判断一个元素是否属于某个集合时,有可能会把不属于这个集合的元素误认为属于这个集合(false positive).因此,Bloom Filter不适合那些“零错误”的应用场合.而在能容忍低错误率的应用场合下,Bloom Filter通过极少的错误换取了存储空间的极大节省. 集合表示和元

基于BitSet的布隆过滤器(Bloom Filter)

布隆过滤器 Bloom Filter 是由Howard Bloom 在 1970 年提出的二进制向量数据结构,它具有很好的空间和时间效率,被用来检测一个元素是不是集合中的一个成员.如果检测结果为是,该元素不一定在集合中:但如果检测结果为否,该元素一定不在集合中.因此Bloom filter具有100%的召回率.这样每个检测请求返回有"在集合内(可能错误)"和"不在集合内(绝对不在集合内)"两种情况,可见 Bloom filter 是牺牲了正确率和时间以节省空间. 当

Bloom Filter 算法简介 (增加 Counting Bloom Filter 内容)

Bloom Filter的中文翻译叫做布隆过滤器,是1970年由布隆提出的.它实际上是一个很长的二进制向量和一系列随机映射函数.布隆过滤器可以用于检索一个元素是否在一个集合中.它的优点是空间效率和查询时间都远远超过一般的算法,缺点是有一定的误识别率和删除困难.如文章标题所述,本文只是做简单介绍,属于科普文章. 应用场景在正式介绍Bloom Filter算法之前,先来看看什么时候需要用到Bloom Filter算法.1. HTTP缓存服务器.Web爬虫等主要工作是判断一条URL是否在现有的URL集

php实现Bloom Filter

Bloom Filter(BF) 是由Bloom在1970年提出的一种多哈希函数映射的快速查找算法,用于快速查找某个元素是否属于集合, 但不要求百分百的准确率. Bloom filter通常用于爬虫的url去重,即判断某个url是否已经被爬过. 原理方面我引用一篇别人的文章,讲的比较清晰了,在此我不予赘述, 更多信息可以参考其论文. 看过几个php实现的BF,都觉得可读性不是很强, 本文主要给出我对Bloom Filter的一个php实现. 原理: <引用自这篇文章> 一. 实例为了说明Bl

Bloom Filter 大规模数据处理利器

BloomFilter–大规模数据处理利器 Bloom Filter是由Bloom在1970年提出的一种多哈希函数映射的快速查找算法.通常应用在一些需要快速判断某个元素是否属于集合,但是并不严格要求100%正确的场合. 一. 实例为了说明Bloom Filter存在的重要意义,举一个实例: 假设要你写一个网络爬虫程序(web crawler).由于网络间的链接错综复杂,爬虫在网络间爬行很可能会形成“环”.为了避免形成“环”,就需要知道爬虫程序已经访问过那些URL.给一个URL,怎样知道爬虫程序