DeepLearning to digit recongnizer in kaggle

最近在看deeplearning，于是就找了kaggle上字符识别进行练习。这里我主要用两种工具箱进行求解，并比对两者的结果。两种工具箱分别是DeepLearningToolbox和caffe。DeeplearningToolbox源码解析见：http://blog.csdn.net/lu597203933/article/details/46576017

Caffe学习见：http://caffe.berkeleyvision.org/

一：DeeplearningToolbox

DeeplearningToolbox基于matlab，非常的简单，读下源码，对于了解卷积神经网络等过程非常有帮助。这里我主要是对digit recongnizer给出的数据集进行预处理以使其适用于我们的deeplearningToolbox工具箱。主要包含两个.m文件，分别是predeal.m和cnntest.m文件。所需要做的就是改变addpath的路径，代码注释很详细，大家自己看。

代码

predeal.m

% use the deeplearnToolbox to solve the digit recongnizer in kaggle!
clear;clc
trainFile = 'train.csv';
testFile = 'test.csv';
fidId = fopen(trainFile);

M = csvread(trainFile, 1);   % 读取csv文件除第一行以外的所有数据
train_x = M(:, 2:end);    %第2列开始为数据data
label = M(:,1)';  %第一列为标签
label(label == 0) = 10;   % 不变为10 下面一句无法处理
train_y = full(sparse(label, 1:size(train_x, 1), 1));   %将标签变成一个矩阵

train_x = double(reshape(train_x',28,28,size(train_x, 1)))/255;  

fidId = fopen('test.csv');     %% 处理预测的数据
M = csvread(testFile, 1);   % 读取csv文件除第一行以外的所有数据
test_x = double(reshape(M',28,28,size(M, 1)))/255;
clear fidId label testFile M testFile trainFile

addpath D:\DeepLearning\DeepLearnToolbox-master\data\      %路径需要改下
addpath D:\DeepLearning\DeepLearnToolbox-master\CNNaddpath D:\DeepLearning\DeepLearnToolbox-master\util
rand('state',0)
cnn.layers = {        %%% 设置各层feature maps个数及卷积模板大小等属性
    struct('type', 'i') %input layer
    struct('type', 'c', 'outputmaps', 6, 'kernelsize', 5) %convolution layer
    struct('type', 's', 'scale', 2) %sub sampling layer
    struct('type', 'c', 'outputmaps', 12, 'kernelsize', 5) %convolution layer
    struct('type', 's', 'scale', 2) %subsampling layer
};

opts.alpha = 0.01;   %迭代下降的速率
opts.batchsize = 50;   %每次选择50个样本进行更新  随机梯度下降，每次只选用50个样本进行更新
opts.numepochs = 25;   %迭代次数
cnn = cnnsetup(cnn, train_x, train_y);      %对各层参数进行初始化 包括权重和偏置
cnn = cnntrain(cnn, train_x, train_y, opts);  %训练的过程，包括bp算法及迭代过程

test_y = cnntest(cnn, test_x);      %对测试数据集进行测试
test_y(test_y == 10) = 0;      %标签10 需要反转为0
test_y = test_y';
M = [(1:length(test_y))' test_y(:)];
csvwrite('test_y.csv', M);
figure; plot(cnn.rL);

cnntest.m

  function [test_y] = cnntest(net, x)
    %  feedforward
    net = cnnff(net, x);
    [~, test_y] = max(net.o);
end

结果：用deeplearningToolbox得到的结果并不是很好，只有0.94586

二：caffe to digit recongnizer

虽然caffe自带了mnist对例子对字符进行处理，但是官网给出的数据是二进制的文件，得到的结果也只是一个简单的准确率，所以不能无限制的套用。步骤如下：

1：将给定csv数据转变成lmdb格式

这里我在mnist的文件夹下写了一个convert_data_to_lmdb.cpp的程序对数据进行处理：

代码如下：

#include <iostream>
#include <string>
#include <sstream>
#include <gflags/gflags.h>

#include "boost/scoped_ptr.hpp"
#include "gflags/gflags.h"
#include "glog/logging.h"

#include "caffe/proto/caffe.pb.h"
#include "caffe/util/db.hpp"
#include "caffe/util/io.hpp"
#include "caffe/util/rng.hpp"

using namespace caffe;
using namespace std;
using std::pair;
using boost::scoped_ptr;

/* edited by Zack
 * argv[1] the input file, argv[2] the output file*/

DEFINE_string(backend, "lmdb", "The backend for storing the result");  // get Flags_backend == lmdb

int main(int argc, char **argv){
	::google::InitGoogleLogging(argv[0]);

	#ifndef GFLAGS_GFLAGS_H_
	  namespace gflags = google;
	#endif

	if(argc < 3){
		LOG(ERROR)<< "please check the input arguments!";
		return 1;
	}
	ifstream infile(argv[1]);
	if(!infile){
		LOG(ERROR)<< "please check the input arguments!";
		return 1;
	}
	string str;
	int count = 0;
	int rows = 28;
	int cols = 28;
	unsigned char *buffer = new  unsigned char[rows*cols];
	stringstream ss;

	Datum datum;             // this data structure store the data and label
	datum.set_channels(1);    // the channels
	datum.set_height(rows);    // rows
	datum.set_width(cols);     // cols

	scoped_ptr<db::DB> db(db::GetDB(FLAGS_backend));         // new DB object
	db->Open(argv[2], db::NEW);                    // open the lmdb file to store the data
	scoped_ptr<db::Transaction> txn(db->NewTransaction());   // new Transaction object to put and commit the data

	const int kMaxKeyLength = 256;           // to save the key
	char key_cstr[kMaxKeyLength];

	bool flag= false;
	while(getline(infile, str)){
		if(flag == false){
			flag = true;
			continue;
		}
		int beg = 0;
		int end = 0;
		int str_index = 0;
		//test  need to add this----------1
		//datum.set_label(0);
		while((end = str.find_first_of(',', beg)) != string::npos){
			//cout << end << endl;
			string dig_str = str.substr(beg, end - beg);
			int pixes;
			ss.clear();
			ss << dig_str;
			ss >> pixes;
			// test need to delete this--------------2
			if(beg == 0){
				datum.set_label(pixes);
				beg = ++ end;
				continue;
			}
			buffer[str_index++] = (unsigned char)pixes;
			beg = ++end;
		}
		string dig_str = str.substr(beg);
		int pixes;
		ss.clear();
		ss << dig_str;
		ss >> pixes;
		buffer[str_index++] = (unsigned char)pixes;
		datum.set_data(buffer, rows*cols);

		int length = snprintf(key_cstr, kMaxKeyLength, "%08d", count);

		    // Put in db
		string out;
		CHECK(datum.SerializeToString(&out));              // serialize to string
		txn->Put(string(key_cstr, length), out);        // put it, both the key and value

		if (++count % 1000 == 0) {       // to commit every 1000 iteration
		  // Commit db
		  txn->Commit();
		  txn.reset(db->NewTransaction());
		  LOG(ERROR) << "Processed " << count << " files.";
		}

	}
	// write the last batch
	  if (count % 1000 != 0) {            // commit the last batch
		txn->Commit();
		LOG(ERROR) << "Processed " << count << " files.";
	  }

	return 0;
}

然后我们执行make all –j8对代码进行编译。这样在build文件夹下就会生成相应的二进制文件了。

如图：

然后执行./build/examples/mnist/convert_data_to_lmdb.bin examples/mnist/kaggle/data/train.csvexamples/mnist/kaggle/mnist_train_lmdb --backend=lmdb

就可以得到得到训练文件的lmdb格式文件了。对于测试test.csv，由于test.csv没有标签，所以需要对代码进行细微调整，2处调整已在上述代码中标注了。

然后同样执行make all –j8，再执行./build/examples/mnist/convert_data_to_lmdb.bin examples/mnist/kaggle/data/test.csvexamples/mnist/kaggle/mnist_test_lmdb --backend=lmdb

就可以得到所对应的测试数据的lmdb格式文件了。

2：用训练数据进行训练得到model

Caffe在训练model的时候，代码需要在每隔test_iter时间就要对测试数据集进行测试，因此我们这里可以用train.csv的前1000条数据制作一个交叉验证的数据集lmdb, 过程和上面一样。

分别将mnist目录下面的lenet_solver.prototxt和lenet_train_test.prototxt拷贝到kaggle目录下面，并对相应的包含文件所在目录和对应的batch size进行修改。具体见：下载地址。

然后执行./build/tools/caffe train –solver=examples/mnist/kaggle/lenet_solver.prototxt，这样就可以得到我们的lenet_iter_10000.caffemodel了。

3：提取测试集prob层的特征。

这里我们使用tools文件下的extract_features.cpp的源文件，但是该源文件产生的结果是lmdb的格式，因此我对源码进行了修改如下：

#include <stdio.h>  // for snprintf
#include <string>
#include <vector>
#include <fstream>

#include "boost/algorithm/string.hpp"
#include "google/protobuf/text_format.h"

#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/net.hpp"
#include "caffe/proto/caffe.pb.h"
#include "caffe/util/db.hpp"
#include "caffe/util/io.hpp"
#include "caffe/vision_layers.hpp"

using caffe::Blob;
using caffe::Caffe;
using caffe::Datum;
using caffe::Net;
using boost::shared_ptr;
using std::string;
namespace db = caffe::db;

template<typename Dtype>
int feature_extraction_pipeline(int argc, char** argv);

int main(int argc, char** argv) {
  return feature_extraction_pipeline<float>(argc, argv);
//  return feature_extraction_pipeline<double>(argc, argv);
}

template<typename Dtype>
int feature_extraction_pipeline(int argc, char** argv) {
  ::google::InitGoogleLogging(argv[0]);
  const int num_required_args = 7;     /// the parameters must be not less 7
  if (argc < num_required_args) {
    LOG(ERROR)<<
    "This program takes in a trained network and an input data layer, and then"
    " extract features of the input data produced by the net.\n"
    "Usage: extract_features  pretrained_net_param"
    "  feature_extraction_proto_file  extract_feature_blob_name1[,name2,...]"
    "  save_feature_dataset_name1[,name2,...]  num_mini_batches  db_type"
    "  [CPU/GPU] [DEVICE_ID=0]\n"
    "Note: you can extract multiple features in one pass by specifying"
    " multiple feature blob names and dataset names seperated by ','."
    " The names cannot contain white space characters and the number of blobs"
    " and datasets must be equal.";
    return 1;
  }
  int arg_pos = num_required_args;     //the necessary nums of parameters

  arg_pos = num_required_args;
  if (argc > arg_pos && strcmp(argv[arg_pos], "GPU") == 0) {          // whether use GPU------ -gpu 0
    LOG(ERROR)<< "Using GPU";
    uint device_id = 0;
    if (argc > arg_pos + 1) {
      device_id = atoi(argv[arg_pos + 1]);
      CHECK_GE(device_id, 0);
    }
    LOG(ERROR) << "Using Device_id=" << device_id;
    Caffe::SetDevice(device_id);
    Caffe::set_mode(Caffe::GPU);
  } else {
    LOG(ERROR) << "Using CPU";
    Caffe::set_mode(Caffe::CPU);
  }

  arg_pos = 0;  // the name of the executable
  std::string pretrained_binary_proto(argv[++arg_pos]);      // the mode had been trained

  // Expected prototxt contains at least one data layer such as
  //  the layer data_layer_name and one feature blob such as the
  //  fc7 top blob to extract features.
  /*
   layers {
     name: "data_layer_name"
     type: DATA
     data_param {
       source: "/path/to/your/images/to/extract/feature/images_leveldb"
       mean_file: "/path/to/your/image_mean.binaryproto"
       batch_size: 128
       crop_size: 227
       mirror: false
     }
     top: "data_blob_name"
     top: "label_blob_name"
   }
   layers {
     name: "drop7"
     type: DROPOUT
     dropout_param {
       dropout_ratio: 0.5
     }
     bottom: "fc7"
     top: "fc7"
   }
   */
  std::string feature_extraction_proto(argv[++arg_pos]);    // get the net structure
  shared_ptr<Net<Dtype> > feature_extraction_net(
      new Net<Dtype>(feature_extraction_proto, caffe::TEST));               //new net object  and set each layers------feature_extraction_net
  feature_extraction_net->CopyTrainedLayersFrom(pretrained_binary_proto);           // init the weights

  std::string extract_feature_blob_names(argv[++arg_pos]);          //exact which blob's feature
  std::vector<std::string> blob_names;
  boost::split(blob_names, extract_feature_blob_names, boost::is_any_of(","));   //you can exact many blobs' features and to store them in different dirname

  std::string save_feature_dataset_names(argv[++arg_pos]);   // to store the features
  std::vector<std::string> dataset_names;
  boost::split(dataset_names, save_feature_dataset_names,         // each dataset_names to store one blob's feature
               boost::is_any_of(","));
  CHECK_EQ(blob_names.size(), dataset_names.size()) <<
      " the number of blob names and dataset names must be equal";
  size_t num_features = blob_names.size();     // how many features you exact

  for (size_t i = 0; i < num_features; i++) {
    CHECK(feature_extraction_net->has_blob(blob_names[i]))
        << "Unknown feature blob name " << blob_names[i]
        << " in the network " << feature_extraction_proto;
  }

  int num_mini_batches = atoi(argv[++arg_pos]);            // each exact num_mini_batches of images

  // init the DB and Transaction for all blobs you want to extract features
  std::vector<shared_ptr<db::DB> > feature_dbs;               // new DB object, is a vector  maybe has many blogs' feature
  std::vector<shared_ptr<db::Transaction> > txns;            // new Transaction object, is a vectore maybe has many blob's feature

  // edit by Zack
   //std::string strfile = "/home/hadoop/caffe/textileImage/features/probTest";
  std::string strfile = argv[argc-1];
  std::vector<std::ofstream*> vec(num_features, 0);

  const char* db_type = argv[++arg_pos];                  //the data to store style == lmdb
  for (size_t i = 0; i < num_features; ++i) {
    LOG(INFO)<< "Opening dataset " << dataset_names[i];               // dataset_name[i] to store the feature which type is lmdb
    shared_ptr<db::DB> db(db::GetDB(db_type));             // the type of the db
    db->Open(dataset_names.at(i), db::NEW);          // open the dir to store the feature
    feature_dbs.push_back(db);             // put the db to the vector
    shared_ptr<db::Transaction> txn(db->NewTransaction());     // the transaction to the db
    txns.push_back(txn);                // put the transaction to the vector

// edit by Zack

    std::stringstream ss;
    ss.clear();
    string index;
    ss << i;
    ss >> index;
    std::string str = strfile + index + ".txt";
    vec[i] = new std::ofstream(str.c_str());
  }

  LOG(ERROR)<< "Extacting Features";

  Datum datum;
  const int kMaxKeyStrLength = 100;
  char key_str[kMaxKeyStrLength];      // to store the key
  std::vector<Blob<float>*> input_vec;
  std::vector<int> image_indices(num_features, 0);   /// how many blogs' feature you exact

  for (int batch_index = 0; batch_index < num_mini_batches; ++batch_index) {
    feature_extraction_net->Forward(input_vec);
    for (int i = 0; i < num_features; ++i) {    // to exact the blobs' name  maybe fc7 fc8
      const shared_ptr<Blob<Dtype> > feature_blob = feature_extraction_net
          ->blob_by_name(blob_names[i]);
      int batch_size = feature_blob->num();     // the nums of images-------batch size
      int dim_features = feature_blob->count() / batch_size;    // this dim of this feature of each image in this blob
      const Dtype* feature_blob_data;   // float is the features
      for (int n = 0; n < batch_size; ++n) {
        datum.set_height(feature_blob->height());     // set the height
        datum.set_width(feature_blob->width());     // set the width
        datum.set_channels(feature_blob->channels());    // set the channel
        datum.clear_data();               // clear data
        datum.clear_float_data();        // clear float_data
        feature_blob_data = feature_blob->cpu_data() +
            feature_blob->offset(n);    //the features of  which image
        for (int d = 0; d < dim_features; ++d) {
          datum.add_float_data(feature_blob_data[d]);
          (*vec[i]) << feature_blob_data[d] << " ";          // save the features
        }
        (*vec[i]) << std::endl;
        //LOG(ERROR)<< "dim" << dim_features;
        int length = snprintf(key_str, kMaxKeyStrLength, "%010d",
            image_indices[i]);       // key  di ji ge tupian
        string out;
        CHECK(datum.SerializeToString(&out));    // serialize to string
        txns.at(i)->Put(std::string(key_str, length), out);       // put to transaction
        ++image_indices[i];       // key++
        if (image_indices[i] % 1000 == 0) {    // when it reach to 1000 ,we commit it
          txns.at(i)->Commit();
          txns.at(i).reset(feature_dbs.at(i)->NewTransaction());
          LOG(ERROR)<< "Extracted features of " << image_indices[i] <<
              " query images for feature blob " << blob_names[i];
        }
      }  // for (int n = 0; n < batch_size; ++n)
    }  // for (int i = 0; i < num_features; ++i)
  }  // for (int batch_index = 0; batch_index < num_mini_batches; ++batch_index)
  // write the last batch
  for (int i = 0; i < num_features; ++i) {
    if (image_indices[i] % 1000 != 0) {     // commit the last path images
      txns.at(i)->Commit();
    }
    // edit by Zack
      vec[i]->close();
      delete vec[i];

    LOG(ERROR)<< "Extracted features of " << image_indices[i] <<
        " query images for feature blob " << blob_names[i];
    feature_dbs.at(i)->Close();
  }

  LOG(ERROR)<< "Successfully extracted the features!";
  return 0;
}

最后将得到的prob层(即最后得到的概率)存入到了txt中了。

此外对网络结构进行了调整，只需要预测，网络中的参数都可以去掉不要了，,

deploy.prototxt代码如下：

name: "LeNet"
layer {
  name: "mnist"
  type: "Data"
  top: "data"
  top: "label"
  transform_param {
    scale: 0.00390625
  }
  data_param {
    source: "examples/mnist/kaggle/mnist_test_lmdb"
    batch_size: 100
    backend: LMDB
  }
}

layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"

  convolution_param {
    num_output: 20
    kernel_size: 5
    stride: 1

  }
}
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "conv1"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "conv2"
  type: "Convolution"
  bottom: "pool1"
  top: "conv2"

  convolution_param {
    num_output: 50
    kernel_size: 5
    stride: 1

  }
}
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "conv2"
  top: "pool2"
  pooling_param {
    pool: MAX
    kernel_size: 2
    stride: 2
  }
}
layer {
  name: "ip1"
  type: "InnerProduct"
  bottom: "pool2"
  top: "ip1"

  inner_product_param {
    num_output: 500

  }
}
layer {
  name: "relu1"
  type: "ReLU"
  bottom: "ip1"
  top: "ip1"
}
layer {
  name: "ip2"
  type: "InnerProduct"
  bottom: "ip1"
  top: "ip2"

  inner_product_param {
    num_output: 10

  }
}
layer {
  name: "prob"
  type: "Softmax"
  bottom: "ip2"
  top: "prob"
}
layer {
  name: "accuracy"
  type: "Accuracy"
  bottom: "prob"
  bottom: "label"
  top: "accuracy"
}
layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "ip2"
  bottom: "label"
  top: "loss"
}

然后执行

./build/tools/extract_features.binexamples/mnist/kaggle/lenet_iter_10000.caffemodelexamples/mnist/kaggle/deploy.prototxt prob examples/mnist/kaggle/features 280lmdb /home/hadoop/caffe/caffe-master/examples/mnist/kaggle/feature

其中280为迭代次数，因为在deploy.prototxt中batch_size设为了100，故就为总共的测试数据集的大小=28000. /home/hadoop/caffe/caffe-master/examples/mnist/kaggle/feature为最终的提取特征存放在txt保存的路径。examples/mnist/kaggle/lenet_iter_10000.caffemodel为训练的权重参数，examples/mnist/kaggle/deploy.prototxt为网络结构。

4：对得到的txt进行后处理

通过上面三个步骤，我们就可以得到feture0.txt，存放的数据位28000*10大小，对应每个样本属于哪一类发生的概率。然后执行以下matlab代码就可以得到kaggle所需要的提交结果了。最后的准确率为0.98986，排名也提升了400+，great！！

% caffe toolbox, the postprocessing of the data
clear;clc;
feature = load('feature0.txt');
feature = feature';
[~,test_y] = max(feature);
[M,N] = size(test_y);
test_y = test_y - repmat([1], M, N);
test_y = test_y';
M = [(1:length(test_y))' test_y(:)];
csvwrite('test_y3.csv', M);

所有文件代码下载见：稍后给出！

时间： 2024-11-05 21:51:41

DeepLearning to digit recongnizer in kaggle

DeepLearning to digit recongnizer in kaggle

一：DeeplearningToolbox

二：caffe to digit recongnizer

1：将给定csv数据转变成lmdb格式

2：用训练数据进行训练得到model

3：提取测试集prob层的特征。

4：对得到的txt进行后处理

DeepLearning to digit recongnizer in kaggle的相关文章

DeepLearning to digit recognizer in kaggle

kaggle实战记录 =>Digit Recognizer(7月份完全掌握细节及内容）

Kaggle竞赛题目之——Digit Recognizer

KNN算法Hadoop实现及kaggle digit recognition数据测试

【DeepLearning】Exercise: Implement deep networks for digit classification

Kaggle Digit Recognizer 练习

大数据竞赛平台——Kaggle 入门

Kaggle

Kaggle 数据挖掘比赛经验分享