huffman编码压缩和解压

#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

/*
 * One node of the Huffman tree.
 * The symbol and the two child pointers share storage through the anonymous
 * union, so isLeaf decides which of them may be read.
 */
typedef struct huffman_node_tag
{
    unsigned char        isLeaf;    /* non-zero: leaf (symbol is valid); zero: internal node (zero/one are valid) */
    unsigned long        count;     /* frequency count used to order nodes while the tree is built */
    struct huffman_node_tag *parent;    /* parent node; 0 until the node is attached to one */
    union
    {
       struct {
           struct huffman_node_tag *zero;   /* child selected by a 0 bit */
           struct huffman_node_tag *one;    /* child selected by a 1 bit */
       };
       unsigned char symbol;    /* byte value carried by a leaf */
    };
} huffman_node;

/* The bit string assigned to one symbol. */
typedef struct huffman_code_tag
{
    unsigned long numbits;  /* number of meaningful bits in the code */
    unsigned char *bits;    /* packed code bits; bit i is read as (bits[i/8] >> (i%8)) & 1 */
} huffman_code;

/* Number of bytes needed to store numbits bits, rounding up to a whole byte. */
static unsigned long numbytes_from_numbits(unsigned long numbits)
{
    unsigned long whole_bytes = numbits / 8;

    if (numbits % 8 != 0)
    {
        return whole_bytes + 1;
    }
    return whole_bytes;
}

/*
 * Return bit number i (0 or 1) of a packed bit array.
 * Bit 0 is the least significant bit of bits[0].
 */
static unsigned char get_bit(unsigned char *bits, unsigned long i)
{
    unsigned long byte_index = i / 8;
    unsigned long bit_offset = i % 8;
    unsigned char mask = (unsigned char)(1u << bit_offset);

    return (bits[byte_index] & mask) ? 1 : 0;
}

/*
 * Reverse the order of the first numbits bits of a packed bit array in place.
 *
 * After the call, bit i holds what bit (numbits-1-i) held before. Bits in the
 * last byte beyond numbits are cleared.
 *
 * The reversal swaps bits pairwise from both ends, so no scratch buffer is
 * allocated and nothing can leak or fail.
 *
 * bits    - packed bit array, at least ceil(numbits/8) bytes long
 * numbits - number of bits to reverse; 0 (or a NULL array) is a no-op
 */
static void reverse_bits(unsigned char* bits, unsigned long numbits)
{
    unsigned long lo;
    unsigned long hi;

    if (bits == NULL || numbits == 0)
    {
        return;
    }

    for (lo = 0, hi = numbits - 1; lo < hi; ++lo, --hi)
    {
        unsigned char lo_bit = (unsigned char)((bits[lo / 8] >> (lo % 8)) & 1);
        unsigned char hi_bit = (unsigned char)((bits[hi / 8] >> (hi % 8)) & 1);

        /* Swapping two different bits is the same as flipping both. */
        if (lo_bit != hi_bit)
        {
            bits[lo / 8] ^= (unsigned char)(1u << (lo % 8));
            bits[hi / 8] ^= (unsigned char)(1u << (hi % 8));
        }
    }

    /* Clear the padding bits of a partially used last byte. */
    if (numbits % 8 != 0)
    {
        bits[(numbits - 1) / 8] &= (unsigned char)((1u << (numbits % 8)) - 1u);
    }
}

/*
 * Build the code for a leaf by walking from the leaf up to the root.
 *
 * Each step records one bit: 1 when the current node is its parent's `one`
 * child, 0 otherwise. The bits are collected leaf-to-root and then reversed,
 * so the stored code reads root-to-leaf.
 *
 * A node without a parent yields a code with numbits == 0 and bits == NULL.
 *
 * Returns a newly allocated huffman_code (release it with free_code), or
 * NULL when memory cannot be allocated.
 */
static huffman_code* new_code(const huffman_node* leaf)
{
    unsigned long numbits = 0;
    unsigned char *bits = NULL;
    huffman_code  *p;

    while(leaf != NULL && leaf->parent != NULL)
    {
        huffman_node *parent = leaf->parent;
        unsigned long cur_byte = numbits / 8;
        unsigned char cur_bit  = (unsigned char)(numbits % 8);

        /* Starting a new byte: grow the buffer by one zeroed byte. */
        if(cur_bit == 0)
        {
            /* Keep the old pointer until realloc is known to have succeeded. */
            unsigned char *grown = (unsigned char*)realloc(bits, cur_byte + 1);
            if(grown == NULL)
            {
                free(bits);
                return NULL;
            }
            bits = grown;
            bits[cur_byte] = 0;
        }

        if(leaf == parent->one)
        {
            bits[cur_byte] |= (unsigned char)(1u << cur_bit);
        }

        ++numbits;
        leaf = parent;
    }

    if(bits != NULL)
    {
        reverse_bits(bits, numbits);
    }

    p = (huffman_code*)malloc(sizeof(huffman_code));
    if(p == NULL)
    {
        free(bits);
        return NULL;
    }
    p->numbits = numbits;
    p->bits    = bits;

    return p;
}

/*
 * Allocate a leaf node for `symbol` with a count of 0 and no parent.
 * Returns NULL when memory cannot be allocated.
 */
static huffman_node* new_leaf_node(unsigned char symbol)
{
    huffman_node *p = (huffman_node*)malloc( sizeof(huffman_node) );

    if(p == NULL)
    {
        return NULL;
    }

    p->isLeaf = 1;
    p->symbol = symbol;
    p->count = 0;
    p->parent = 0;

    return p;
}

/*
 * Allocate an internal node with the given count and children and no parent.
 * The children's parent pointers are not modified here.
 * Returns NULL when memory cannot be allocated.
 */
static huffman_node* new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one)
{
    huffman_node *p = (huffman_node*)malloc( sizeof(huffman_node) );

    if(p == NULL)
    {
        return NULL;
    }

    p->isLeaf = 0;
    p->count = count;
    p->zero = zero;
    p->one = one;
    p->parent = 0;
    return p;
}

/*
 * Release a subtree: both children first (internal nodes only), then the
 * node itself. A NULL subtree is ignored.
 */
static void free_huffman_tree(huffman_node *subtree)
{
    if(subtree != NULL)
    {
        if(subtree->isLeaf == 0)
        {
            free_huffman_tree(subtree->zero);
            free_huffman_tree(subtree->one);
        }
        free(subtree);
    }
}

/*
 * Release a code and its bit buffer.
 * A NULL code is ignored, mirroring free().
 */
static void free_code(huffman_code* p)
{
    if(p == NULL)
    {
        return;
    }
    free(p->bits);
    free(p);
}

/* Number of distinct byte values; both tables below are indexed by symbol. */
#define MAX_SYMBOLS 256
typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];   /* one tree node per symbol, NULL when the symbol was not seen */
typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];       /* one code per symbol, NULL when the symbol has no code */
/*
 * Release every code stored in the encoder and then the encoder table itself.
 *
 * The table is expected to be the heap block returned by
 * calculate_huffman_codes; it must not be used after this call.
 * A NULL encoder is ignored.
 */
static void free_encoder(SymbolEncoder *pSE)
{
    unsigned long i;

    if(pSE == NULL)
    {
        return;
    }

    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*pSE)[i];
        if(p != NULL)
        {
            free_code(p);
        }
    }

    /* The table was allocated with malloc; without this it leaks. */
    free(pSE);
}
/* Reset the frequency table so that no symbol has a node yet. */
static void
init_frequencies(SymbolFrequencies *pSF)
{
    unsigned int slot;

    for(slot = 0; slot < MAX_SYMBOLS; ++slot)
    {
        (*pSF)[slot] = NULL;
    }
}

// ----------------------------------------------------------------------------------------
/*
 * A small write cache in front of a growable output buffer.
 *
 * Data is first collected in `cache`. When it is flushed, the cached bytes
 * are appended to the output buffer (*pbufout), which is grown with realloc
 * each time, and *pbufoutlen is updated to the new total length.
 */
typedef struct buf_cache_tag
{
    unsigned char *cache;       /* staging buffer */
    unsigned int cache_len;     /* capacity of `cache` in bytes */
    unsigned int cache_cur;     /* number of bytes currently held in `cache` */
    unsigned char **pbufout;    /* address of the caller's output buffer pointer */
    unsigned int *pbufoutlen;   /* address of the caller's output length */
} buf_cache;

/*
 * Initialise a buf_cache with a staging buffer of cache_size bytes and an
 * empty output buffer (*pbufout = NULL, *pbufoutlen = 0).
 *
 * Returns 0 on success and 1 on failure (NULL output arguments or failed
 * allocation), matching the "non-zero means error" convention that
 * huffman_encode_memory relies on when it checks this result.
 */
static int init_cache(buf_cache    *pc,
                    unsigned int    cache_size,
                    unsigned char **pbufout,
                    unsigned int    *pbufoutlen)
{
    assert(pc && pbufout && pbufoutlen);
    if(!pbufout || !pbufoutlen) return 1;

    pc->cache      = (unsigned char*)malloc(cache_size);
    pc->cache_len  = cache_size;
    pc->cache_cur  = 0;             /* the cache starts empty */
    pc->pbufout    = pbufout;
    *pbufout       = NULL;
    pc->pbufoutlen = pbufoutlen;
    *pbufoutlen    = 0;

    /* Success is a usable staging buffer. */
    return (pc->cache != NULL ? 0 : 1);
}

/*
 * Release the staging buffer of a buf_cache and clear the pointer.
 * The output buffer referenced by pbufout is left untouched.
 */
static void free_cache(buf_cache* pc)
{
    assert( pc );

    if(pc->cache == NULL)
    {
        return;
    }

    free(pc->cache);
    pc->cache = NULL;
}

/*
 * Append the cached bytes to the output buffer and empty the cache.
 *
 * Returns 0 on success (including when there is nothing to flush) and 1 when
 * the output buffer cannot be grown; in that case nothing is modified.
 */
static int flush_cache(buf_cache* pc)
{
    unsigned int pending;
    unsigned int stored;
    unsigned char *grown;

    assert( pc );

    pending = pc->cache_cur;
    if(pending == 0)
    {
        return 0;
    }

    stored = *(pc->pbufoutlen);
    grown = (unsigned char*)realloc(*(pc->pbufout), pending + stored);
    if(grown == NULL)
    {
        return 1;
    }

    memcpy(grown + stored, pc->cache, pending);
    *(pc->pbufout) = grown;
    *(pc->pbufoutlen) = pending + stored;
    pc->cache_cur = 0;

    return 0;
}

/*
 * Write to_write_len bytes through the cache.
 *
 * If the data fits into the free space of the cache it is only buffered.
 * Otherwise the cache is flushed first and the new data is appended directly
 * to the output buffer, so the output keeps the order of the writes.
 *
 * Returns 0 on success, 1 when the output buffer cannot be grown. A failed
 * flush is reported instead of being ignored, because appending after it
 * would put the new bytes ahead of the still-cached ones.
 */
static int write_cache(buf_cache* pc,
                     const void *to_write,
                     unsigned int to_write_len)
{
    unsigned char* tmp;

    assert(pc && to_write);
    assert(pc->cache_len >= pc->cache_cur);

    if(to_write_len > pc->cache_len - pc->cache_cur)
    {
        unsigned int newlen;

        if(flush_cache(pc) != 0)
        {
            return 1;
        }

        newlen = *pc->pbufoutlen + to_write_len;
        tmp = (unsigned char*)realloc(*pc->pbufout, newlen);
        if(tmp == NULL)
        {
            return 1;
        }
        memcpy(tmp + *pc->pbufoutlen, to_write, to_write_len);
        *pc->pbufout = tmp;
        *pc->pbufoutlen = newlen;
    }
    else
    {
        memcpy(pc->cache + pc->cache_cur, to_write, to_write_len);
        pc->cache_cur += to_write_len;
    }

    return 0;
}

/*
 * Count how often each byte value occurs in the stream `in`.
 *
 * The table is cleared first; a leaf node is created the first time a symbol
 * is seen and its count is incremented for every occurrence.
 *
 * Returns the total number of bytes read from the stream.
 */
static unsigned int get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)
{
    unsigned int seen = 0;
    int ch;

    init_frequencies(pSF);

    for(ch = fgetc(in); ch != EOF; ch = fgetc(in))
    {
        unsigned char sym = (unsigned char)ch;
        huffman_node **slot = &(*pSF)[sym];

        if(*slot == NULL)
        {
            *slot = new_leaf_node(sym);
        }
        (*slot)->count += 1;
        seen += 1;
    }

    return seen;
}
/*
 * Count how often each byte value occurs in a memory buffer; the in-memory
 * counterpart of get_symbol_frequencies.
 *
 * Returns the number of bytes examined.
 */
static unsigned int get_symbol_frequencies_from_memory(SymbolFrequencies    *pSF,
                               const unsigned char    *bufin,
                               unsigned int       bufinlen)
{
    unsigned int pos = 0;

    init_frequencies(pSF);

    while(pos < bufinlen)
    {
        unsigned char sym = bufin[pos];
        huffman_node **slot = &(*pSF)[sym];

        if(*slot == NULL)
        {
            *slot = new_leaf_node(sym);
        }
        (*slot)->count += 1;
        pos += 1;
    }

    /* One count per byte visited. */
    return pos;
}

/*
 * qsort comparator for arrays of huffman_node pointers.
 * Orders nodes by ascending count; NULL entries sort after every real node.
 */
static int SFComp(const void *p1, const void *p2)
{
    const huffman_node *lhs = *(const huffman_node**)p1;
    const huffman_node *rhs = *(const huffman_node**)p2;

    if(lhs == NULL || rhs == NULL)
    {
        /* A NULL entry is "greater" than a node; two NULLs compare equal. */
        return (lhs == NULL) - (rhs == NULL);
    }

    return (lhs->count > rhs->count) - (lhs->count < rhs->count);
}

/*
 * Walk the tree and store a freshly built code for every leaf in *pSE,
 * indexed by the leaf's symbol. The zero branch is visited before the one
 * branch; NULL subtrees are skipped.
 */
static void
build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSE)
{
    if(subtree == NULL)
    {
        return;
    }

    if(!subtree->isLeaf)
    {
        build_symbol_encoder(subtree->zero, pSE);
        build_symbol_encoder(subtree->one, pSE);
        return;
    }

    (*pSE)[subtree->symbol] = new_code(subtree);
}

/*
 * calculate_huffman_codes builds the Huffman tree from the frequency table
 * and derives the code of every symbol.
 *
 * On return (*pSF)[0] is the root of the tree (NULL when no symbol was seen)
 * and the remaining entries are NULL. The return value is a heap-allocated
 * SymbolEncoder indexed by symbol value, or NULL when memory cannot be
 * allocated.
 *
 * Special cases:
 *  - no symbols: the merge loop is skipped and an empty encoder is returned;
 *  - exactly one distinct symbol: the lone leaf is hung below a new root as
 *    its `zero` child so that the symbol gets the 1-bit code "0". Without
 *    this the code would have zero bits and could not be decoded.
 */
static SymbolEncoder*
calculate_huffman_codes(SymbolFrequencies * pSF)
{
    unsigned int i = 0;
    unsigned int n = 0;
    huffman_node *m1  = NULL, *m2 = NULL;
    SymbolEncoder *pSE = NULL;

    /*
     * Sort by ascending frequency. Unused (NULL) entries end up at the back,
     * so the used entries form a prefix of the array.
     */
    qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);

    /* Count the used entries. */
    for(n = 0; (n<MAX_SYMBOLS) && (*pSF)[n]; ++n)
       ;

    if(n == 1)
    {
        huffman_node *only = (*pSF)[0];
        huffman_node *wrapper = new_nonleaf_node(only->count, only, NULL);
        if(wrapper == NULL)
        {
            return NULL;
        }
        only->parent = wrapper;
        (*pSF)[0] = wrapper;
    }

    /*
     * Construct the Huffman tree. This is based on the algorithm given in
     * Managing Gigabytes by Ian Witten et al, 2nd edition, page 34, using a
     * simple count instead of a probability.
     *
     * The bound is written as i + 1 < n because n is unsigned: n - 1 would
     * wrap around when n is 0.
     */
    for(i = 0; i + 1 < n; ++i)
    {
        huffman_node *merged;

        /* The two least frequent subtrees are at the front. */
        m1 = (*pSF)[0];
        m2 = (*pSF)[1];

        /* Replace them by one node whose count is the sum of both. */
        merged = new_nonleaf_node(m1->count + m2->count, m1, m2);
        if(merged == NULL)
        {
            return NULL;
        }
        m1->parent = merged;
        m2->parent = merged;
        (*pSF)[0] = merged;
        (*pSF)[1] = NULL;

        /* Re-sort so the merged node moves to its place and the NULL to the back. */
        qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);
    }

    /* Build the SymbolEncoder array from the tree. */
    pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));
    if(pSE == NULL)
    {
        return NULL;
    }
    memset(pSE, 0, sizeof(SymbolEncoder));
    build_symbol_encoder((*pSF)[0], pSE);

    return pSE;
}

/*
 * Write the Huffman code table to `out`. The format is:
 *   4 byte code count (number of distinct symbols), host byte order
 *   4 byte number of bytes encoded
 *     (decoding the data must yield this number of bytes)
 *   code1
 *   ...
 *   codeN, where N is the code count
 * Each code has the following format:
 *   1 byte symbol, 1 byte code bit length, code bytes.
 * Each entry has numbytes_from_numbits(bit length) code bytes; the last byte
 * may contain unused bits when the length is not a multiple of 8.
 *
 * Both header fields are written as unsigned int so that their width matches
 * what read_code_table reads; writing an unsigned long would emit 8 bytes on
 * platforms where long is 64 bits wide.
 *
 * Returns 0 on success, 1 on a write error.
 */
static int
write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{
    unsigned long i;
    unsigned int count = 0;

    /* Determine the number of entries in se. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if( (*se)[i] )
        {
            ++count;
        }
    }

    /* Header: number of entries, then number of encoded bytes. */
    if(fwrite(&count, sizeof(count), 1, out) != 1) return 1;
    if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1) return 1;

    /* Write the entries. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if( p != NULL )
        {
            unsigned int numbytes = (unsigned int)numbytes_from_numbits( p->numbits );

            /* 1 byte symbol; i < MAX_SYMBOLS, so it fits. */
            if(fputc((unsigned char)i, out) == EOF) return 1;
            /* 1 byte code length in bits. */
            if(fputc((unsigned char)p->numbits, out) == EOF) return 1;
            /* Code bytes, padded with zero bits up to a whole byte. */
            if(numbytes > 0 && fwrite(p->bits, 1, numbytes, out) != numbytes) return 1;
        }
    }

    return 0;
}

/*
 * Write the Huffman code table through the cache into the output buffer.
 *
 * Layout (same as the file format):
 *   4 byte code count (number of distinct symbols), host byte order
 *   4 byte number of bytes encoded
 *   one entry per symbol that has a code:
 *     1 byte symbol, 1 byte code bit length, code bytes
 *
 * Both header fields are written as unsigned int so that their width matches
 * what read_code_table_from_memory reads; writing an unsigned long would
 * emit 8 bytes on platforms where long is 64 bits wide.
 *
 * Returns 0 on success, 1 when a write fails.
 */
static int
write_code_table_to_memory(buf_cache      *pc,
                        SymbolEncoder   *se,
                        unsigned int    symbol_count)
{
    unsigned long i;
    unsigned int count = 0;

    /* Determine the number of entries in se. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*se)[i])
        {
            ++count;
        }
    }

    /* Header: number of entries, then number of encoded bytes. */
    if( write_cache(pc, &count, sizeof(count)) )
        return 1;
    if( write_cache(pc, &symbol_count, sizeof(symbol_count)) )
        return 1;

    /* Write the entries. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if( p )
        {
            unsigned int numbytes = (unsigned int)numbytes_from_numbits(p->numbits);
            /* i < MAX_SYMBOLS (256), so it can be stored in an unsigned char. */
            unsigned char uc = (unsigned char)i;

            /* 1 byte symbol. */
            if(write_cache(pc, &uc, sizeof(uc)))   return 1;

            /* 1 byte code length in bits; the decoder needs it to know how
             * many of the following bits belong to the code. */
            uc = (unsigned char)p->numbits;
            if(write_cache(pc, &uc, sizeof(uc)))   return 1;

            /* Code bytes. Skipped for an empty code, whose bits pointer is
             * NULL and would trip the assertion in write_cache. */
            if(numbytes > 0 && write_cache(pc, p->bits, numbytes)) return 1;
        }
    }

    return 0;
}

/*
 * read_code_table builds a Huffman tree from the code
 * in the in file. This function returns NULL on error.
 * The returned value should be freed with free_huffman_tree.
 *
 *
 */
static huffman_node*
read_code_table(FILE* in, unsigned int *pDataBytes)
{
    huffman_node *root = new_nonleaf_node(0, NULL, NULL);
    unsigned int count;

    /*
     * Read the number of entries.
     * (it is stored in network byte order).
     * 获得字符种数, 前2个byte就是出现的字符种数
     */
    if( fread(&count, sizeof(count), 1, in) != 1 )
    {
       free_huffman_tree( root );
       return NULL;
    }

    //count = ntohl(count);
    count = count;
    /*
     * Read the number of data bytes this encoding represents.
     * 一个有多少个字符
     */
    if( fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1 )
    {
       free_huffman_tree(root);
       return NULL;
    }

    //*pDataBytes = ntohl(*pDataBytes);
    *pDataBytes = *pDataBytes;

    /* Read the entries. */
    while(count-- > 0)
    {
       int           c;
       unsigned int curbit;
       unsigned char symbol;
       unsigned char numbits;
       unsigned char numbytes;
       unsigned char *bytes;
       huffman_node *p = root;

       if( (c=fgetc(in)) == EOF )
       {
           free_huffman_tree( root );
           return NULL;
       }
       symbol = (unsigned char)c;

       if( (c=fgetc(in)) == EOF )
       {
           free_huffman_tree( root );
           return NULL;
       }

       numbits    = (unsigned char)c;
       numbytes   = (unsigned char)numbytes_from_numbits( numbits );
       bytes      = (unsigned char*)malloc( numbytes );
       if( fread(bytes, 1, numbytes, in) != numbytes )
       {
           free(bytes);
           free_huffman_tree(root);
           return NULL;
       }

       /*
        * Add the entry to the Huffman tree. The value
        * of the current bit is used switch between
        * zero and one child nodes in the tree. New nodes
        * are added as needed in the tree.
        */
       for(curbit = 0; curbit < numbits; ++curbit)
       {
           if(get_bit(bytes, curbit))
           {
              if(p->one == NULL)
              {
                  p->one =
                     curbit == (unsigned char)(numbits-1) ?
                     new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL);
                  p->one->parent = p;
              }
              p = p->one;
           }
           else
           {
              if(p->zero == NULL)
              {
                  p->zero =
                     curbit == (unsigned char)(numbits - 1) ?
                     new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL);
                  p->zero->parent = p;
              }
              p = p->zero;
           }
       }

       free(bytes);
    }

    return root;
}
/*
 * Copy readlen bytes from buf, starting at offset *pindex, into bufout and
 * advance *pindex by readlen.
 *
 * buf     - source buffer
 * buflen  - total length of buf in bytes
 * pindex  - in: offset to start reading at; out: offset after the read
 * bufout  - destination, at least readlen bytes
 * readlen - number of bytes to copy
 *
 * Returns 0 on success and 1 when fewer than readlen bytes remain, in which
 * case nothing is copied and *pindex is unchanged. A read that ends exactly
 * at the end of the buffer is valid. The remaining length is computed by
 * subtraction so the check cannot wrap around.
 */
static int
memread(const unsigned char*    buf,
       unsigned int         buflen,
       unsigned int         *pindex,
       void*                bufout,
       unsigned int         readlen)
{
    assert(buf && pindex && bufout);
    assert(buflen >= *pindex);

    if(buflen < *pindex)             return 1;
    if(readlen > buflen - *pindex)   return 1;

    memcpy(bufout, buf + *pindex, readlen);
    *pindex += readlen;

    return 0;
}
/*
 * 从编码后的buf内读数据.
 */
static huffman_node*
read_code_table_from_memory(const unsigned char* bufin,
                         unsigned int         bufinlen,
                         unsigned int         *pindex,
                         unsigned int         *pDataBytes)
{
    huffman_node *root = new_nonleaf_node(0, NULL, NULL);
    unsigned int count;

    /*
     * Read the number of entries.
     * (it is stored in network byte order).
     * 读取
     */
    if( memread(bufin, bufinlen, pindex, &count, sizeof(count)) )
    {
       free_huffman_tree(root);
       return NULL;
    }

    //count = ntohl(count);
    count = count;

    /* Read the number of data bytes this encoding represents. */
    if(memread(bufin, bufinlen, pindex, pDataBytes, sizeof(*pDataBytes)))
    {
       free_huffman_tree(root);
       return NULL;
    }

    //*pDataBytes = ntohl(*pDataBytes);
    *pDataBytes = *pDataBytes;

    /* Read the entries. */
    while( (count--) > 0 )
    {
       unsigned int curbit;
       unsigned char symbol;
       unsigned char numbits;
       unsigned char numbytes;
       unsigned char *bytes;
       huffman_node *p = root;

       if(memread(bufin, bufinlen, pindex, &symbol, sizeof(symbol)))
       {
           free_huffman_tree(root);
           return NULL;
       }

       if(memread(bufin, bufinlen, pindex, &numbits, sizeof(numbits)))
       {
           free_huffman_tree(root);
           return NULL;
       }

       numbytes = (unsigned char)numbytes_from_numbits(numbits);
       bytes = (unsigned char*)malloc(numbytes);
       if(memread(bufin, bufinlen, pindex, bytes, numbytes))
       {
           free(bytes);
           free_huffman_tree(root);
           return NULL;
       }

       /*
        * Add the entry to the Huffman tree. The value
        * of the current bit is used switch between
        * zero and one child nodes in the tree. New nodes
        * are added as needed in the tree.
        */
       for(curbit = 0; curbit < numbits; ++curbit)
       {
           if(get_bit(bytes, curbit))
           {
              if(p->one == NULL)
              {
                  p->one = ( curbit==(unsigned char)(numbits - 1) ) ?
                  new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL);
                  p->one->parent = p;
              }
              p = p->one;
           }
           else
           {
              if(p->zero == NULL)
              {
                  p->zero = curbit == (unsigned char)(numbits - 1)
                     ? new_leaf_node(symbol)
                     : new_nonleaf_node(0, NULL, NULL);
                  p->zero->parent = p;
              }
              p = p->zero;
           }
       }

       free(bytes);
    }

    return root;
}
/*
 * Encode the stream `in` to `out` using the codes in *se.
 *
 * The code bits of consecutive symbols are packed back to back, least
 * significant bit first, without aligning each code to a byte boundary.
 * A final partially filled byte is written with its unused bits set to 0.
 *
 * Returns 0 on success, 1 when a symbol has no code in *se or a write fails.
 */
static int
do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)
{
    unsigned char curbyte = 0;
    unsigned char curbit  = 0;
    int c;

    while( (c = fgetc(in)) != EOF)
    {
        unsigned char uc   = (unsigned char)c;
        huffman_code *code = (*se)[uc];
        unsigned long i;

        /* The encoder was built from an earlier pass over the input; a
         * symbol without a code means the input changed in between. */
        if(code == NULL)
        {
            return 1;
        }

        for(i = 0; i < code->numbits; ++i)
        {
            curbyte |= (unsigned char)(get_bit(code->bits, i) << curbit);

            /* Emit the byte once all eight bits are filled. */
            if(++curbit == 8)
            {
                if(fputc(curbyte, out) == EOF)
                {
                    return 1;
                }
                curbyte = 0;
                curbit = 0;
            }
        }
    }

    /* Flush the remaining bits, if any. */
    if(curbit > 0 && fputc(curbyte, out) == EOF)
    {
        return 1;
    }

    return 0;
}

/*
 * Encode bufin through the cache using the codes in *se.
 *
 * Code bits are packed back to back, least significant bit first; every
 * completed byte is handed to write_cache, and a trailing partial byte is
 * written at the end.
 *
 * Returns 0 on success, or the non-zero result of a failed write_cache.
 */
static int do_memory_encode(buf_cache *pc, const unsigned char *bufin,
               unsigned int         bufinlen,
               SymbolEncoder           *se)
{
    unsigned char pending = 0;  /* byte currently being assembled */
    unsigned char filled  = 0;  /* number of bits already placed in `pending` */
    unsigned int pos;

    for(pos = 0; pos < bufinlen; ++pos)
    {
        huffman_code *code = (*se)[bufin[pos]];
        unsigned long bit;

        for(bit = 0; bit < code->numbits; ++bit)
        {
            pending |= ( get_bit(code->bits, bit) << filled );
            ++filled;

            if(filled != 8)
            {
                continue;
            }

            /* A full byte is ready: write it and start a new one. */
            if(write_cache(pc, &pending, sizeof(pending)))
            {
                return 1;
            }
            pending = 0;
            filled  = 0;
        }
    }

    if(filled == 0)
    {
        return 0;
    }
    return write_cache(pc, &pending, sizeof(pending));
}

/*
 * huffman_encode_file Huffman-encodes the stream `in` into `out`.
 *
 * The input is read twice: once to count symbol frequencies and, after a
 * rewind, once more to emit the encoded data behind the code table.
 *
 * Returns 0 on success, non-zero when writing the table or the data fails.
 */
int
huffman_encode_file(FILE *in, FILE *out)
{
    SymbolFrequencies sf;
    unsigned int symbol_count = get_symbol_frequencies(&sf, in);
    SymbolEncoder *se = calculate_huffman_codes( &sf );
    huffman_node *root = sf[0];     /* the tree root is left in slot 0 */
    int rc;

    /* Back to the start of the input for the encoding pass. */
    rewind( in );

    rc = write_code_table(out, se, symbol_count);
    if(rc == 0)
    {
        rc = do_file_encode(in, out, se);
    }

    free_huffman_tree( root );
    free_encoder(se);

    return rc;
}

/*
 * Decode a Huffman-encoded stream `in` into `out`.
 *
 * The code table is read first; then the data bits are consumed least
 * significant bit first, walking the tree from the root and emitting a
 * symbol each time a leaf is reached, until the number of bytes announced in
 * the table has been produced or the input ends.
 *
 * Returns 0 on success, 1 when the code table cannot be read, when the data
 * follows a branch that does not exist in the tree, or when a write fails.
 */
int huffman_decode_file(FILE *in, FILE *out)
{
    huffman_node *root;
    huffman_node *p;
    int           c;
    unsigned int data_count;

    /* Read the Huffman code table. */
    root = read_code_table(in, &data_count);
    if( !root )   return 1;

    /* Decode the file. */
    p = root;
    while(data_count > 0 && (c = fgetc(in)) != EOF)
    {
        unsigned char byte = (unsigned char)c;
        unsigned char mask = 1;

        /* mask becomes 0 after the eighth shift, which ends the byte. */
        while(data_count > 0 && mask)
        {
            p = ( (byte & mask) ? p->one : p->zero );
            mask <<= 1;

            /* Corrupt data: the bit sequence leaves the tree. */
            if(p == NULL)
            {
                free_huffman_tree( root );
                return 1;
            }

            if( p->isLeaf )
            {
                if(fputc(p->symbol, out) == EOF)
                {
                    free_huffman_tree( root );
                    return 1;
                }
                p = root;
                --data_count;
            }
        }
    }

    free_huffman_tree( root );

    return 0;
}

// --------------------------------------------------------------------------------------
/* Size in bytes of the staging cache used while encoding to memory. */
#define CACHE_SIZE 1024

/*
 * Huffman-encode bufin (bufinlen bytes) into a newly allocated buffer.
 *
 * On return *pbufout points to the output buffer (grown with realloc by the
 * cache functions) and *pbufoutlen holds its length. The output consists of
 * the code table followed by the packed code bits of the input.
 *
 * Returns 0 on success and non-zero on failure. A failure of the final cache
 * flush is reported as well, since the output would otherwise be silently
 * truncated.
 */
int huffman_encode_memory(const unsigned char    *bufin,
                       unsigned int           bufinlen,
                       unsigned char       **pbufout,
                       unsigned int           *pbufoutlen)
{
    SymbolFrequencies    sf;
    SymbolEncoder     *se;
    huffman_node *root = NULL;
    int rc;
    unsigned int symbol_count;  /* number of bytes in the input */
    buf_cache cache;

    /* Ensure the arguments are valid. */
    if(!pbufout || !pbufoutlen) return 1;
    if( init_cache(&cache, CACHE_SIZE, pbufout, pbufoutlen) )   return 1;

    /* Get the frequency of each symbol in the input memory. */
    symbol_count = get_symbol_frequencies_from_memory(&sf, bufin, bufinlen);

    /* Build the tree and the per-symbol codes; the root ends up in sf[0]. */
    se = calculate_huffman_codes( &sf );
    root = sf[0];

    /*
     * Write the code table first, then scan the memory again and append the
     * encoded data using the table previously built.
     */
    rc = write_code_table_to_memory(&cache, se, symbol_count);
    if(rc == 0)
    {
        rc = do_memory_encode(&cache, bufin, bufinlen, se);
    }

    /* Flush the cache; keep an earlier error code if there is one. */
    if(flush_cache( &cache ) != 0 && rc == 0)
    {
        rc = 1;
    }

    /* Free the Huffman tree, the codes and the cache. */
    free_huffman_tree( root );
    free_encoder( se );
    free_cache( &cache );

    return rc;
}

/**
 * Decode a Huffman-encoded memory buffer.
 *
 * On success *pbufout receives a malloc'ed buffer holding the decoded bytes
 * and *pbufoutlen the number of bytes actually decoded; the caller frees the
 * buffer.
 *
 * Returns 0 on success. Returns 1, leaving the output arguments untouched,
 * when an argument is NULL, the code table cannot be read, the output buffer
 * cannot be allocated, or the data follows a branch that does not exist in
 * the tree.
 */
int huffman_decode_memory(const unsigned char    *bufin,
                       unsigned int           bufinlen,
                       unsigned char       **pbufout,
                       unsigned int           *pbufoutlen)
{
    huffman_node *root, *p;
    unsigned int data_count;
    unsigned int i = 0;
    unsigned char *buf;
    unsigned int bufcur = 0;

    /* Ensure the arguments are valid. */
    if(!pbufout || !pbufoutlen) return 1;

    /* Read the Huffman code table; i is advanced past it. */
    root = read_code_table_from_memory(bufin, bufinlen, &i, &data_count);
    if(!root)  return 1;

    /* data_count comes from the input, so the allocation may fail.
     * malloc(0) may legitimately return NULL. */
    buf = (unsigned char*)malloc(data_count);
    if(buf == NULL && data_count > 0)
    {
        free_huffman_tree(root);
        return 1;
    }

    /* Decode the memory. */
    p = root;
    for(; i < bufinlen && data_count > 0; ++i)
    {
        unsigned char byte = bufin[i];
        unsigned char mask = 1;

        /* mask becomes 0 after the eighth shift, which ends the byte. */
        while(data_count > 0 && mask)
        {
            p = (byte & mask) ? p->one : p->zero;
            mask <<= 1;

            /* Corrupt data: the bit sequence leaves the tree. */
            if(p == NULL)
            {
                free(buf);
                free_huffman_tree(root);
                return 1;
            }

            if(p->isLeaf)
            {
                buf[bufcur++] = p->symbol;
                p = root;
                --data_count;
            }
        }
    }

    free_huffman_tree(root);
    *pbufout = buf;
    *pbufoutlen = bufcur;

    return 0;
}

/*
--------------------------------------------------------------------------------

不妨假设待编码的buffer为 "ABCAADC"
手工分析可得该树的形状  :
当然也可以也可以将这个树沿y方向翻转180度
               root
               /  \
              A    *
                  / \
                 C   *
                    / \
                   B   D

现在我们知道的两个事实是 :
这个buffer内的字符数为         : symbol_count    = 7  ( "ABCAADC"一个有7个字符 )
这个buffer内出现的字符种数为    : count       = 4 ( 只出现了ABCD四种字符 )

接下来人工分析各个字符 :
symbol |   count  |       bits
--------|-----------|---------------------
  A     |     3     |   0000 0000
  B     |     1     |   0000 0110
  C     |     2     |   0000 0010
  D     |     1     |    0000 0111
我们设置左边为0, 右边为1. bits为从叶子节点走到root的路径.

分析完毕后, 需要实现整个编码过程. 编码过程暂时跳过.
假设成功编码完毕, 需要把编码后的数据写入到bufout内.

bufout内的
0-3个byte为字符种数 count
4-7个byte为字符个数 symbol_count

然后是遍历SymbolEncoder, 依次对每种字符进行编码(我们这个例子只进行4次编码)
我们对每种字符都会进行编码, 每个字符编码后的输出不妨称为frame
那么这个frame是由三个部分组成的:

(这个我们可以肯定一个char肯定是1byte的)
symbol (1byte)    -- 字符

(这个我们可以肯定就算这个树根本没有分支, 永远只有左/右孩子, 那也了不起是是256的深度)
numbits (1byte)   -- 叶子走到root需要的步数

bits   (1byte)    -- 叶子走到root的方式(即最终的编码, 比如说011)
开始我对这个bites到底占了多少个byte很是怀疑, 因为我不知道从叶子走到root
到底耗费了几步. 这里需要好好研究一下, 最好和最差情况. 暂时假设是个变化的byte吧.
但是有一点bites跟numbits是有关系的, 所以只要知道numbits还是可以知道bites占据了
多少byte的, 也知道bits到底是有几位.
    byte       content
---------------------------------------------
    0-3(4)     count
    4-7(4)     symbol_count

    A占xa个byte frame_struct
    B占xb个byte frame_struct
    C占xc个byte frame_struct
    D占xd个byte frame_struct

       X          X      

这个X是do_file_encode函数写到bufout中去的数据, 那么这个数据是什么呢?
实际上它是循环的把出现的字符的bits写到bufout中,
根据这个数据,解码的时候就可以依次的找到第0,1,2...个位置出现的是什么字符了
--------------------------------------------------------------------------------
*/
/*
 * Demo driver: compresses "main.cpp" into "temp.txt" and decompresses that
 * file again into "out.cpp".
 *
 * All files are opened in binary mode because the compressed stream is
 * arbitrary binary data. The compressed file is closed before it is reopened
 * for reading, so that buffered output has reached the file.
 *
 * Returns 0 when both steps succeed, 1 otherwise.
 */
int
main(int argc, char** argv)
{
    FILE *origin  = NULL;
    FILE *out     = NULL;
    FILE *outfile = NULL;
    FILE *result  = NULL;
    int rc = 1;

    (void)argc;
    (void)argv;

    /* Step 1: compress. */
    origin = fopen("main.cpp", "rb");
    out    = fopen("temp.txt", "wb");
    if(origin != NULL && out != NULL)
    {
        rc = huffman_encode_file(origin, out);
    }
    if(origin != NULL)
    {
        fclose(origin);
    }
    /* fclose flushes; a failure here means the output is incomplete. */
    if(out != NULL && fclose(out) != 0)
    {
        rc = 1;
    }

    /* Step 2: decompress what was just written. */
    if(rc == 0)
    {
        rc = 1;
        outfile = fopen("temp.txt", "rb");
        result  = fopen("out.cpp", "wb");
        if(outfile != NULL && result != NULL)
        {
            rc = huffman_decode_file(outfile, result);
        }
        if(outfile != NULL)
        {
            fclose(outfile);
        }
        if(result != NULL && fclose(result) != 0)
        {
            rc = 1;
        }
    }

    if(rc != 0)
    {
        fprintf(stderr, "huffman: compression round trip failed\n");
    }

    system("pause");
    return rc;
}
时间: 2024-10-01 20:03:47

huffman编码压缩和解压的相关文章

ruby利用Zip Gem写一个简单的压缩和解压的小工具

在UNIX下的我们怎么会沦落到用ruby写压缩和解压工具呢?直接上shell啊!但是请允许本猫这次可耻的用ruby来玩玩吧!其实ruby GEM中有很多压缩解压包,我选的是Zip,也许是因为名字符合KISS原则吧!不过在编写中发现Zip中的某些类没有文档中所说明的实例方法,也许在某个平台上还未实现?? 话先说到前头,这个工具如果解压有重名文件的情况会直接覆盖原文件而不会有任何提示!测试时务必注意,如果造成一些文件丢失可别怪本猫啊! 代码也考虑到多文件的情况,如果是压缩多文件则默认会单独压缩每一个

iOS开发 -文件下载(6压缩和解压)

iOS开发网络篇—文件下载(六·压缩和解压) 一.完成文件下载 需求:完成文件下载 1.在本地服务器中,添加一个图片的压缩文件. 2.代码示例: 文件下载器代码: 头文件 1 // 2 // YYfileDownloader.h 3 // 01-文件的下载(不合理) 4 // 5 // Created by apple on 14-7-1. 6 // Copyright (c) 2014年 itcase. All rights reserved. 7 // 8 9 #import <Founda

linux压缩和解压,socket编程

1.使用zip和unzip压缩和解压 Zip aa.zip  文件名(一个) Zip aa.zip 文件名1 文件名2(压缩多个文件) Zip -r aa.zip  文件夹路径(压缩整个文件夹) zip -t 102002 file.zip 压缩当前目录下在2002 10月20日之后的文件压缩 zip file.zip * -x file2.txt 压缩时,将当前目录内的file2.txt文件排除在外 2.解压 unzip aa.zip unzip file.zip x file2 :除了fil

iOS开发网络篇—文件下载(六·压缩和解压)

iOS开发网络篇—文件下载(六·压缩和解压) 一.完成文件下载 需求:完成文件下载 1.在本地服务器中,添加一个图片的压缩文件. 2.代码示例: 文件下载器代码: 头文件 1 // 2 // YYfileDownloader.h 3 // 01-文件的下载(不合理) 4 // 5 // Created by apple on 14-7-1. 6 // Copyright (c) 2014年 itcase. All rights reserved. 7 // 8 9 #import <Founda

Xceed Zip压缩和解压控件Xceed Zip Compression Library

Xceed Zip Compression Library 是一个高性能的 Zip 和 Unzip 数据压缩ActiveX控件.通过它,可以创建和操作与Zip文件,也能在内存中直接压缩/解压数据.它设计提供高度灵活性,并且使用快速的多线程 zip 压缩引擎. 具体功能: ActiveX 技术 ATL 3.0编写,简单且独立的 COM 对象和 ActiveX 控件. 无须外部的压缩动态链接库, MFC DLL 或运行库等. 同时有单线程 (STA) 和多线程 (MTA) 模型设计. 不必将组件置于

C#对文件操作(基本的读写以及压缩和解压)

主要是针对单个文件进行读写操作和压缩操作:用到的主要C#类有FileStream.FileInfo.StreamWrite.StreamRead.GZipStream. 字符数组和字节数组的转换: 1 byte[] bytedata = new byte[200]; 2 char[] chardata = new char[200]; 3 try 4 { 5 FileStream fs = new FileStream("App.config", FileMode.Open); 6 f

Linux下的压缩和解压

1. gzip, bzip2 能否直接压缩目录呢? 不可以 2. 请快速写出,使用gzip和bzip2压缩和解压一个文件的命令.压缩:gzip 1.txt bzip2 1.txt解压:gzip -d 1.txt.gz bzip2 -d 1.txt.bz2 3. tar 在打包的时候,如果想排除多个文件或者目录如何操作?--exclude filename 4. 请实验,如果不加 "-" 是否正确, 如 tar zcvf  1.tar.gz  1.txt 2.txt ?正确,可以执行命令

Linux tar(用来压缩和解压文件)

通过SSH访问服务器,难免会要用到压缩,解压缩,打包,解包等,这时候tar命令就是是必不可少的一个功能强大的工具.linux中最流行的tar是麻雀虽小,五脏俱全,功能强大. tar命令可以为linux的文件和目录创建档案.利用tar,可以为某一特定文件创建档案(备份文件),也可以在档案中改变文件,或者向档案中加入新的文件.tar最初被用来在磁带上创建档案,现在,用户可以在任何设备上创建档案.利用tar命令,可以把一大堆的文件和目录全部打包成一个文件,这对于备份文件或将几个文件组合成为一个文件以便

linux下文件加密压缩和解压的方法

一.用tar命令 对文件加密压缩和解压 压缩:tar -zcf  - filename |openssl des3 -salt -k password | dd of=filename.des3 此命令对filename文件进行加码压缩 生成filename.des3加密压缩文件, password 为加密的密码. 解压:dd if=filename.des3 |openssl des3 -d -k password | tar zxf - 注意命令最后面的“-”  它将释放所有文件, -k p