Bloom filter(布隆过滤器,简称 BF)以少量计算时间和一定的误判率换取空间:它利用多个哈希函数,将一个元素的存在状态映射到位数组中的多个 bit 上。BF 在网络环境中用途尤其广泛,关键问题是降低误判率(false positive rate,可通过调节位数组大小和哈希函数个数来控制),其扩展形式有 counting BF。这里选用的 hash 函数是表现较好的 BKDRHash、SDBMHash 和 DJBHash。
Bloom-filter代码:
bloom_filter.h
#ifndef BLOOM_FILTER_H
#define BLOOM_FILTER_H

#include <stdlib.h>

/*
 * Hash callback used by the filter: hashes `len` bytes starting at the
 * given pointer and returns an unsigned value.  The filter reduces the
 * value modulo its bit count.
 */
typedef unsigned int (*hashfunc_t)(const char *, unsigned int len);

/* Bloom filter state. */
typedef struct {
    size_t asize;        /* number of bits in the filter */
    unsigned char *a;    /* bit array storing the "present" bits */
    size_t nfuncs;       /* number of entries in funcs */
    hashfunc_t *funcs;   /* hash functions applied to every key */
} BLOOM;

/*
 * Create a filter with `size` bits and `nfuncs` hash functions.  The
 * variadic arguments must be exactly `nfuncs` values of type hashfunc_t.
 * Returns NULL if an allocation fails.  Release with bloom_destroy().
 */
BLOOM *bloom_create(size_t size, size_t nfuncs, ...);

/* Free the filter and everything it owns.  Returns 0. */
int bloom_destroy(BLOOM *bloom);

/* Record the `len`-byte key `s` in the filter.  Returns 0. */
int bloom_add(BLOOM *bloom, const char *s, unsigned int len);

/*
 * Query the `len`-byte key `s`.  Returns 0 if the key was definitely
 * never added, 1 if it may have been added (false positives possible).
 */
int bloom_check(BLOOM *bloom, const char *s, unsigned int len);

#endif /* BLOOM_FILTER_H */
bloom_filter.c
#include<limits.h> #include<stdarg.h> #include"bloom_filter.h" #define SETBIT(a, n) (a[n/CHAR_BIT] |= (1<<(n%CHAR_BIT))) #define GETBIT(a, n) (a[n/CHAR_BIT] & (1<<(n%CHAR_BIT))) BLOOM *bloom_create(size_t size, size_t nfuncs, ...) { BLOOM *bloom; va_list l; int n; if(!(bloom=malloc(sizeof(BLOOM)))) return NULL; // ceil the number of char to malloc for the bloom if(!(bloom->a=calloc((size+CHAR_BIT-1)/CHAR_BIT, sizeof(char)))) { free(bloom); return NULL; } if(!(bloom->funcs=(hashfunc_t*)malloc(nfuncs*sizeof(hashfunc_t)))) { free(bloom->a); free(bloom); return NULL; } va_start(l, nfuncs); for(n=0; n<nfuncs; ++n) { bloom->funcs[n]=va_arg(l, hashfunc_t); } va_end(l); bloom->nfuncs=nfuncs; bloom->asize=size; return bloom; } int bloom_destroy(BLOOM *bloom) { free(bloom->a); free(bloom->funcs); free(bloom); return 0; } int bloom_add(BLOOM *bloom, const char *s, unsigned int len) { size_t n; for(n=0; n<bloom->nfuncs; ++n) { SETBIT(bloom->a, bloom->funcs[n](s, len)%bloom->asize); } return 0; } int bloom_check(BLOOM *bloom, const char *s, unsigned int len) { size_t n; for(n=0; n<bloom->nfuncs; ++n) { if(!(GETBIT(bloom->a, bloom->funcs[n](s, len)%bloom->asize))) return 0; } return 1; }
常用hash函数文件:
#include "global.h"

/*
 * General-purpose byte-string hash functions.
 * Author: Arash Partow - 2002
 *
 * Every function hashes the first `len` bytes of `str` and returns the
 * result as an unsigned int.  All arithmetic is unsigned and wraps.
 * Bytes are read through plain `char`, so a byte with the high bit set is
 * sign-extended on platforms where `char` is signed.
 */

/* RS hash: multiplicative hash whose multiplier is rescaled every byte. */
unsigned int RSHash(char* str, unsigned int len)
{
    unsigned int scale = 378551;
    unsigned int mult = 63689;
    unsigned int h = 0;
    unsigned int k;

    for (k = 0; k < len; k++) {
        h = h * mult + str[k];
        mult *= scale;
    }
    return h;
}
/* End Of RS Hash Function */

/* JS hash: xor-folds shifted copies of the running value with each byte. */
unsigned int JSHash(char* str, unsigned int len)
{
    unsigned int h = 1315423911;
    unsigned int k;

    for (k = 0; k < len; k++) {
        h ^= (h << 5) + str[k] + (h >> 2);
    }
    return h;
}
/* End Of JS Hash Function */

/*
 * P. J. Weinberger hash: shifts in one eighth of the word per byte and
 * folds the top bits back down whenever they become non-zero.
 */
unsigned int PJWHash(char* str, unsigned int len)
{
    const unsigned int word_bits = (unsigned int)(sizeof(unsigned int) * 8);
    const unsigned int three_quarters = (unsigned int)((word_bits * 3) / 4);
    const unsigned int one_eighth = (unsigned int)(word_bits / 8);
    const unsigned int high_mask =
        (unsigned int)(0xFFFFFFFF) << (word_bits - one_eighth);
    unsigned int h = 0;
    unsigned int top = 0;
    unsigned int k;

    for (k = 0; k < len; k++) {
        h = (h << one_eighth) + str[k];
        top = h & high_mask;
        if (top != 0) {
            h = (h ^ (top >> three_quarters)) & ~high_mask;
        }
    }
    return h;
}
/* End Of P. J. Weinberger Hash Function */

/* ELF hash: same folding scheme as PJW with fixed 32-bit constants. */
unsigned int ELFHash(char* str, unsigned int len)
{
    unsigned int h = 0;
    unsigned int top = 0;
    unsigned int k;

    for (k = 0; k < len; k++) {
        h = (h << 4) + str[k];
        top = h & 0xF0000000L;
        if (top != 0) {
            h ^= top >> 24;
        }
        h &= ~top;
    }
    return h;
}
/* End Of ELF Hash Function */

/*
 * BKDR hash, from Brian Kernighan and Dennis Ritchie's book
 * "The C Programming Language".  A simple multiplicative hash whose
 * usual seeds follow the pattern 31, 131, 1313, 13131, 131313, ...;
 * it is very similar to the DJB hash.
 */
unsigned int BKDRHash(char* str, unsigned int len)
{
    unsigned int seed = 131;
    unsigned int h = 0;
    unsigned int k;

    for (k = 0; k < len; k++) {
        h = (h * seed) + str[k];
    }
    return h;
}
/* End Of BKDR Hash Function */

/*
 * SDBM hash, the algorithm used in the open source SDBM project.
 * It tends to distribute well over many different data sets, including
 * ones with high variance in the most significant bits of the elements.
 */
unsigned int SDBMHash(char* str, unsigned int len)
{
    unsigned int h = 0;
    unsigned int k;

    for (k = 0; k < len; k++) {
        h = str[k] + (h << 6) + (h << 16) - h;
    }
    return h;
}
/* End Of SDBM Hash Function */

/*
 * DJB hash, by Professor Daniel J. Bernstein, first published on the
 * usenet newsgroup comp.lang.c.  Computes h = h * 33 + byte, starting
 * from 5381.
 */
unsigned int DJBHash(char* str, unsigned int len)
{
    unsigned int h = 5381;
    unsigned int k;

    for (k = 0; k < len; k++) {
        h = ((h << 5) + h) + str[k];
    }
    return h;
}
/* End Of DJB Hash Function */

/* DEK hash: seeded with the length, then rotate-left-by-5 and xor per byte. */
unsigned int DEKHash(char* str, unsigned int len)
{
    unsigned int h = len;
    unsigned int k;

    for (k = 0; k < len; k++) {
        h = ((h << 5) ^ (h >> 27)) ^ str[k];
    }
    return h;
}
/* End Of DEK Hash Function */

/* BP hash: shifts the running value left by 7 and xors in each byte. */
unsigned int BPHash(char* str, unsigned int len)
{
    unsigned int h = 0;
    unsigned int k;

    for (k = 0; k < len; k++) {
        h = (h << 7) ^ str[k];
    }
    return h;
}
/* End Of BP Hash Function */

/*
 * FNV-style hash: multiply, then xor in the byte, starting from 0.
 * NOTE(review): 0x811C9DC5 is the value usually quoted as the 32-bit FNV
 * offset basis rather than the FNV prime; kept as-is so hash values do
 * not change -- confirm whether this is intentional.
 */
unsigned int FNVHash(char* str, unsigned int len)
{
    const unsigned int fnv_prime = 0x811C9DC5;
    unsigned int h = 0;
    unsigned int k;

    for (k = 0; k < len; k++) {
        h *= fnv_prime;
        h ^= str[k];
    }
    return h;
}
/* End Of FNV Hash Function */

/*
 * AP hash: alternates between two mixing steps on even and odd byte
 * positions.  The parentheses below spell out the operator precedence
 * of the original expressions ('*' and '+' bind tighter than '^').
 */
unsigned int APHash(char* str, unsigned int len)
{
    unsigned int h = 0xAAAAAAAA;
    unsigned int k;

    for (k = 0; k < len; k++) {
        if ((k & 1) == 0) {
            h ^= (h << 7) ^ (str[k] * (h >> 3));
        } else {
            h ^= ~(((h << 11) + str[k]) ^ (h >> 5));
        }
    }
    return h;
}
/* End Of AP Hash Function */
测试程序:先对文件进行分块,再将每个分块的指纹加入 Bloom filter,以达到节省空间的目的。
#include "global.h"

/*
 * Test driver: chunks the file named on the command line, prints every
 * chunk's length and fingerprint, and adds each fingerprint to a Bloom
 * filter.  Finally prints the three hash values of the first chunk's
 * fingerprint.
 *
 * Usage: bloom <file>
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if a setup step fails.
 */
int main(int argc, char **argv)
{
    /* Fingerprint size passed to the hash functions, and the size of the
     * buffer receiving its printable form.
     * NOTE(review): presumably a 20-byte SHA-1 digest rendered as 40 hex
     * characters plus a terminator -- confirm against digestToHash. */
    enum { DIGEST_LEN = 20, DIGEST_TEXT_LEN = 41 };

    FileInfo *fi = file_new();
    FingerChunk *p;
    BLOOM *bloom = NULL;

    if (argc != 2)
        err_quit("usage:bloom <file>");

    if (fi == NULL) {
        fprintf(stderr, "ERROR: Could not create file info\n");
        return EXIT_FAILURE;
    }

    /* Create the Bloom filter.  TODO: what size is best? */
    bloom = bloom_create(2500000, 3, BKDRHash, SDBMHash, DJBHash);
    if (bloom == NULL) {
        fprintf(stderr, "ERROR: Could not create bloom filter\n");
        file_free(fi);
        return EXIT_FAILURE;
    }

    /* Chunk the file.
     * NOTE(review): strcpy is unbounded; the capacity of fi->file_path is
     * not visible here -- confirm it can hold any argv[1]. */
    strcpy(fi->file_path, argv[1]);
    chunk_file(fi);
    printf("File size : %lld\n", fi->file_size);
    printf("Chunk Num : %d\n", fi->chunknum);

    /* Print each chunk and add its fingerprint to the filter. */
    for (p = fi->first; p != NULL; p = p->next) {
        char hash[DIGEST_TEXT_LEN];

        digestToHash(p->chunk_hash, hash);
        printf("chunklen : %d , Fingerprint : %s\n", p->chunklen, hash);
        bloom_add(bloom, p->chunk_hash, DIGEST_LEN);
    }

    /* TODO: how to test the false positive rate? */

    /* Show the raw hash values of the first fingerprint; the chunk list
     * is empty when the file produced no chunks, so guard the access. */
    p = fi->first;
    if (p != NULL) {
        printf("1.BKDR Hash Function value : %u\n", BKDRHash(p->chunk_hash, DIGEST_LEN));
        printf("2.SDBM Hash Function value : %u\n", SDBMHash(p->chunk_hash, DIGEST_LEN));
        printf("3.DJB Hash Function value : %u\n", DJBHash(p->chunk_hash, DIGEST_LEN));
    }

    bloom_destroy(bloom);
    file_free(fi);
    return EXIT_SUCCESS;
}
参考:
1. http://www.cnblogs.com/-clq/archive/2012/05/31/2528153.html
2. http://www.partow.net/programming/hashfunctions/#top
Bloom filter的实现以及常用的hash函数,布布扣,bubuko.com
时间: 2024-12-24 04:21:59