简介:利用哈夫曼树实现一个文本文档的压缩,以及对压缩文件的解压
思路:在压缩文件时,首先要统计字符出现的次数,构建哈夫曼树,生成哈夫曼编码,压缩到文件。
在解压文件时,读取压缩文件,将编码与字符相对应,最后将字符写到文件中。
在解压文件中,如何将编码与字符相对应?
我们都知道,在解压文件时,我们只有一个压缩文件,其余一概不知。所以在解压时,需要重建哈夫曼树。要想重建哈夫曼树,就需要知道字符以及字符出现的次数。在压缩文件时,已经统计出字符出现的次数。所以,在压缩文件时,应该写配置文件。配置文件中存放字符以及字符出现的次数。在解压时,读取压缩文件,配置文件,重建哈夫曼树,将编码与字符相对应。
// Binary heap (Heap.h)
#pragma once
#include <assert.h>
#include <stddef.h>
#include <algorithm>
#include <iostream>
#include <utility>
#include <vector>
// Kept on purpose: the headers that include this one use unqualified
// standard names (string, cout, endl) and rely on this directive.
using namespace std;

// Ordering functor: true when l < r. Passing it to Heap yields a min-heap.
template <class T>
struct Less
{
    bool operator()(const T& l, const T& r) const
    {
        return l < r;
    }
};

// Ordering functor: true when l > r. Passing it to Heap yields a max-heap.
template <class T>
struct Greater
{
    bool operator()(const T& l, const T& r) const
    {
        return l > r;
    }
};

// Array-backed binary heap.
//
// Comper(x, y) returning true means "x must sit above y". With the default
// Greater<T> the largest element is on top; with Less<T> the smallest is.
template <class T, class Comper = Greater<T> >
class Heap
{
public:
    // Creates an empty heap.
    Heap()
    {}

    // Builds a heap from the first `size` elements of `a`.
    // `a` is only read when size > 0.
    Heap(T* a, size_t size)
    {
        _a.reserve(size);
        for (size_t i = 0; i < size; ++i)
        {
            _a.push_back(a[i]);
        }
        // Sift down every parent, last parent first. The last parent is at
        // index size/2 - 1; counting with an unsigned index offset by one
        // avoids the underflow that (size - 2) / 2 has for size 0 and 1.
        for (size_t i = _a.size() / 2; i > 0; --i)
        {
            _ApDown(i - 1);
        }
    }

    // Inserts x: append at the end, then sift it up.
    void Push(const T& x)
    {
        _a.push_back(x);
        _ApHeapUp(_a.size() - 1);
    }

    // Removes the top element. The heap must not be empty.
    void Pop()
    {
        assert(!_a.empty());
        using std::swap;
        // Move the top to the back, drop it, then restore the heap order.
        swap(_a[0], _a[_a.size() - 1]);
        _a.pop_back();
        _ApDown(0);
    }

    // Number of stored elements.
    size_t Size() const
    {
        return _a.size();
    }

    // True when the heap holds no elements.
    bool Empty() const
    {
        return _a.empty();
    }

    // Returns a copy of the top element. The heap must not be empty.
    T Top() const
    {
        assert(!_a.empty());
        return _a[0];
    }

public:
    // Sifts the element at index `parent` down until neither child
    // outranks it.
    void _ApDown(size_t parent)
    {
        using std::swap;
        Comper com;
        size_t child = parent * 2 + 1;
        while (child < _a.size())
        {
            // Pick whichever child ranks higher.
            if ((child + 1) < _a.size() && com(_a[child + 1], _a[child]))
            {
                ++child;
            }
            // Swap only if that child outranks the parent.
            if (com(_a[child], _a[parent]))
            {
                swap(_a[child], _a[parent]);
                parent = child;
                child = parent * 2 + 1;
            }
            else
            {
                break;
            }
        }
    }

    // Sifts the element at index `child` up while it outranks its parent.
    void _ApHeapUp(size_t child)
    {
        using std::swap;
        Comper com;
        while (child > 0)
        {
            // Computed inside the loop so child == 0 never underflows.
            const size_t parent = (child - 1) / 2;
            if (com(_a[child], _a[parent]))
            {
                swap(_a[child], _a[parent]);
                child = parent;
            }
            else
            {
                break;
            }
        }
    }

protected:
    vector<T> _a;  // heap storage; index 0 is the top
};
//建哈夫曼树 Huffman.h #include "Heap.h" template <class T> struct HuffmanTreeNode { HuffmanTreeNode(const T& x) :_left(NULL) ,_right(NULL) ,_weight(x) {} HuffmanTreeNode<T>* _left; HuffmanTreeNode<T>* _right; T _weight; }; template <class T> class HuffmanTree { typedef HuffmanTreeNode<T> Node; public: HuffmanTree(const T* a,size_t n,const T& invalue) { struct IsLess { bool operator()(const Node* left,const Node* right) { return left->_weight < right->_weight; } }; Heap<Node*,IsLess> minHeap; for(size_t i=0;i<n;++i) { if(a[i] != invalue) { minHeap.Push(new Node(a[i])); //建小堆 } } while(minHeap.Size() > 1) { Node* left = minHeap.Top(); minHeap.Pop(); Node* right = minHeap.Top(); minHeap.Pop(); Node* parent = new Node(left->_weight+right->_weight); parent->_left = left; parent->_right = right; minHeap.Push(parent); } _root = minHeap.Top(); } Node* GetRoot() { return _root; } protected: Node* _root; }; void HuffmanTreeTest() { int a[] = {1,2,3,4,5,6,7,8,9}; HuffmanTree<int> ht(a,sizeof(a)/sizeof(a[0]),‘#‘); }
//实现压缩,解压 FileCompare.h #define _CRT_SECURE_NO_WARNINGS #include "HuffmanTree.h" #include <assert.h> #include <string> #include <stdlib.h> typedef unsigned long LongType; struct CharInfo { unsigned char _ch; //字符 LongType _count; //字符出现的次数 string _code; //字符对应的Huffman编码 CharInfo() :_ch(0) ,_count(0) {} CharInfo(LongType count) :_ch(0) ,_count(count) {} bool operator!=(const CharInfo& info) const { return _count != info._count; } CharInfo operator+(const CharInfo& info) const { return CharInfo(_count + info._count); } bool operator<(const CharInfo& info) const { return _count < info._count; } }; class FileCompress { public: FileCompress() { for(size_t i=0;i<256;++i) { _info[i]._ch = i; _info[i]._count = 0; } } void GetHuffmanCode(HuffmanTreeNode<CharInfo>* root,string code)//获取哈夫曼编码 { if(root == NULL) return; if(root->_left == NULL && root->_right == NULL) { _info[root->_weight._ch]._code = code; } GetHuffmanCode(root->_left,code + ‘0‘);//左为0 GetHuffmanCode(root->_right,code + ‘1‘);//右为1 } bool ReadLine(FILE* fout,string& line) { char ch = fgetc(fout); if(feof(fout)) //若结束返回非零值 return false; while(!feof(fout) && ch != ‘\n‘) { line += ch; ch = fgetc(fout); } return true; } void Compress(const char* filename) { //统计字符的次数 FILE* fout = fopen(filename,"rb"); assert(fout); char ch = fgetc(fout); while(!feof(fout)) //读到文件尾的标志位 若采用ch != EOF 11111111 跳出读取文件 { _info[(unsigned char)ch]._count++; ch = fgetc(fout); } //构建Huffman树 CharInfo invalue; //非法值 HuffmanTree<CharInfo> tree(_info,256,invalue); //生成Huffman编码 string code; GetHuffmanCode(tree.GetRoot(),code); //压缩 string comFilename = filename; comFilename += ".compress"; FILE* fin = fopen(comFilename.c_str(),"wb"); assert(fin); fseek(fout,0,SEEK_SET); //设置文件指针的位置 ch = fgetc(fout); int size = 0; int value = 0; while(!feof(fout)) //feof 来判断文件是否执行结束,若结束,则返回非零值。 { string code = _info[(unsigned char)ch]._code; for(size_t i=0;i<code.size();++i) { if(code[i] == ‘1‘) { value |= 1; } ++size; if(size == 8) { fputc(value,fin); size = 
0; value = 0; } value <<= 1; } ch = fgetc(fout); } if(size > 0) { value <<= (7-size); fputc(value,fin); } //配置文件 string configfile = filename; configfile += ".config"; FILE* fconfig = fopen(configfile.c_str(),"wb");//以二进制的形式打开 assert(fconfig); char buffer[256]; string line; for(size_t i=0;i<256;++i) { if(_info[i]._count > 0) { line += _info[i]._ch; line += ‘,‘; line += itoa(_info[i]._count,buffer,10); line += ‘\n‘; fputs(line.c_str(),fconfig); } line.clear(); } fclose(fout); fclose(fin); fclose(fconfig); } void Uncompress(const char* filename) { //读配置文件 string configfile = filename; configfile += ".config"; FILE* fconfig = fopen(configfile.c_str(),"rb");//以二进制的形式读取 assert(fconfig); string str; while(ReadLine(fconfig,str)) { if(str.empty()) //处理空行 { str += ‘\n‘; } else { _info[(unsigned char)str[0]]._count = atoi(str.substr(2).c_str());//第二个位置即第三个字符为字符的次数 str.clear(); } } //构建Huffman树 CharInfo invalue; HuffmanTree<CharInfo> tree(_info,256,invalue); //读取压缩文件,进行还原 string comFilename = filename; comFilename += ".compress"; FILE* fout = fopen(comFilename.c_str(),"rb"); assert(fout); HuffmanTreeNode<CharInfo>* root = tree.GetRoot(); HuffmanTreeNode<CharInfo>* cur = root; string uncomFilename = filename; uncomFilename += ".uncompress"; FILE* fin = fopen(uncomFilename.c_str(),"wb"); assert(fin); LongType SumCount = tree.GetRoot()->_weight._count; //总数 char ch = fgetc(fout); int pos = 7; while(1) { if(ch & (1<<pos)) { cur = cur->_right; } else { cur = cur->_left; } if(cur->_left == NULL && cur->_right == NULL) { fputc(cur->_weight._ch,fin); if(--SumCount == 0) { break; } cur = root; } if(pos-- == 0) { ch = fgetc(fout); pos = 7; } } fclose(fout); fclose(fin); } protected: CharInfo _info[256]; }; void PressHuffmanTest() { FileCompress fh; fh.Compress("input"); //fh.Compress("project.txt"); } void UnPressHuffmanTest() { FileCompress fh; fh.Uncompress("input"); //fh.Uncompress("project.txt"); }
//测试 #include "FileCompree.h" #include <windows.h> int main() { //HuffmanTreeTest(); //验证哈弗曼树 int begin1 = GetTickCount(); PressHuffmanTest(); int end1 = GetTickCount(); cout<<"压缩时间为:"<<end1-begin1<<endl; int begin2 = GetTickCount(); UnPressHuffmanTest(); int end2 = GetTickCount(); cout<<"解压时间为:"<<end2-begin2<<endl; return 0; }
测试结果:
比较结果:
时间: 2024-10-12 19:54:27