1. Setup
# set up Python environment: numpy for numerical routines, and matplotlib for plotting
import numpy as np
import matplotlib.pyplot as plt
# display plots in this notebook
%matplotlib inline # put the image in ipython notebook
# set display defaults
plt.rcParams[‘figure.figsize‘] = (10, 10) # large images
plt.rcParams[‘image.interpolation‘] = ‘nearest‘ # don‘t interpolate: show square pixels
plt.rcParams[‘image.cmap‘] = ‘gray‘ # use grayscale output rather than a (potentially misleading) color heatmap
# load caffe
# The caffe module needs to be on the Python path;
# we‘ll add it here explicitly.
import sys
caffe_root=‘./caffe-master/‘ # this file should be run from {caffe_root}/examples
sys.path.insert(0, caffe_root + ‘python‘)
import caffe # If you get "No module named _caffe", either you have not built pycaffe or you have the wrong path.
If needed, download the reference model (“CaffeNet”, a variant of AlexNet).
import os
if os.path.isfile(caffe_root + ‘models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel‘):
print ‘CaffeNet found.‘
else:
print ‘Downloading pre-trained CaffeNet model...‘
!../scripts/download_model_binary.py ../models/bvlc_reference_caffenet
CaffeNet found.
2. Load net and set up input preprocessing
Set Caffe to CPU mode and load the net from disk.
# Run on the CPU and load the trained network from disk.
caffe.set_mode_cpu()

model_def = caffe_root + 'models/bvlc_reference_caffenet/deploy.prototxt'
model_weights = caffe_root + 'models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel'

net = caffe.Net(model_def,      # defines the structure of the model
                model_weights,  # contains the trained weights
                caffe.TEST)     # use test mode
Set up input preprocessing. (We’ll use Caffe’s caffe.io.Transformer to do this, but this step is independent of other parts of Caffe, so any custom preprocessing code may be used).
Our default CaffeNet is configured to take images in BGR format.
Values are expected to start in the range [0, 255] and then have the mean ImageNet pixel value subtracted from them. In addition, the channel dimension is expected as the first (outermost) dimension.
As matplotlib will load images with values in the range [0, 1] in RGB format with the channel as the innermost dimension, we are arranging for the needed transformations here.
# load the mean ImageNet image (as distributed with Caffe) for subtraction
mu = np.load(caffe_root + 'python/caffe/imagenet/ilsvrc_2012_mean.npy')
mu = mu.mean(1).mean(1)  # average over pixels to obtain the mean (BGR) pixel values
# a single pre-formatted string prints the same under Python 2 and Python 3
print('mean-subtracted values: {}'.format(list(zip('BGR', mu))))

# create transformer for the input called 'data'
transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})

transformer.set_transpose('data', (2, 0, 1))     # move image channels to outermost dimension
transformer.set_mean('data', mu)                 # subtract the dataset-mean value in each channel
transformer.set_raw_scale('data', 255)           # rescale from [0, 1] to [0, 255]
transformer.set_channel_swap('data', (2, 1, 0))  # swap channels from RGB to BGR
mean-subtracted values: [(‘B’, 104.0069879317889), (‘G’, 116.66876761696767), (‘R’, 122.6789143406786)]
# show the shape of the input blob: (batch size, channels, height, width)
print(net.blobs['data'].data.shape)
(1, 3, 227, 227)
3. CPU classification
Now we’re ready to perform classification. Since we’ll only classify one image, we set the batch size to 1 here; a larger batch size could be used instead to demonstrate batching.
# set the size of the input (we can skip this if we're happy
# with the default; we can also change it later, e.g., for different batch sizes)
net.blobs['data'].reshape(1,         # batch size
                          3,         # 3-channel (BGR) images
                          227, 227)  # image size is 227x227
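Note: because the input defined in deploy.prototxt is 227x227, the input image is resized to 227x227 here. If a different size is used, an error occurs; the cause was not investigated. (A likely reason is that the fully connected layers have fixed-size weights, e.g. fc6 with shape (4096, 9216), and therefore require a fixed-size input.)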
# Load an image (that comes with Caffe) and perform the preprocessing we've set up.
image = caffe.io.load_image(caffe_root + 'examples/images/cat.jpg')
transformed_image = transformer.preprocess('data', image)
plt.imshow(image)
# Adorable! Let's classify it!

# copy the image data into the memory allocated for the net
net.blobs['data'].data[...] = transformed_image

# perform classification
output = net.forward()
output_prob = output['prob'][0]  # the output probability vector for the first image in the batch
print('predicted class is: {}'.format(output_prob.argmax()))
# The net gives us a vector of probabilities; the value printed is the index
# of the most probable class (281 in the recorded run). But is that correct?
# Let's check the ImageNet labels...
predicted class is: 281
# load ImageNet labels
labels_file = caffe_root + ‘data/ilsvrc12/synset_words.txt‘
if not os.path.exists(labels_file):
!../data/ilsvrc12/get_ilsvrc_aux.sh
labels = np.loadtxt(labels_file, str, delimiter=‘\t‘)
print ‘output label:‘, labels[output_prob.argmax()]
output label: n02123045 tabby, tabby cat
4. Switching to GPU mode
# Let's see how long classification took, and compare it to GPU mode.
# %timeit runs the forward pass in a loop and reports the best time per loop.
%timeit net.forward()
10 loops, best of 3: 38.6 ms per loop
That’s a while for a single image (the batch size is 1 here). Let’s switch to GPU mode.
# Switch Caffe to GPU mode and time the forward pass again.
caffe.set_device(0) # if we have multiple GPUs, pick the first one
caffe.set_mode_gpu()
net.forward() # run once before timing to set up memory
%timeit net.forward()
# That should be much faster!
100 loops, best of 3: 9.45 ms per loop
5. Examining intermediate output
A net is not just a black box; let’s take a look at some of the parameters and intermediate activations.
First we’ll see how to read out the structure of the net in terms of activation and parameter shapes.
For each layer, let’s look at the activation shapes, which typically have the form (batch_size, channel_dim, height, width).
The activations are exposed as an OrderedDict, net.blobs.
# for each layer, show the output (activation) shape
for layer_name, blob in net.blobs.items():
    print(layer_name + '\t' + str(blob.data.shape))
data (1, 3, 227, 227)
conv1 (1, 96, 55, 55)
pool1 (1, 96, 27, 27)
norm1 (1, 96, 27, 27)
conv2 (1, 256, 27, 27)
pool2 (1, 256, 13, 13)
norm2 (1, 256, 13, 13)
conv3 (1, 384, 13, 13)
conv4 (1, 384, 13, 13)
conv5 (1, 256, 13, 13)
pool5 (1, 256, 6, 6)
fc6 (1, 4096)
fc7 (1, 4096)
fc8 (1, 1000)
prob (1, 1000)
Now look at the parameter shapes. The parameters are exposed as another OrderedDict, net.params.
We need to index the resulting values with either [0] for weights or [1] for biases.
The param shapes typically have the form (output_channels, input_channels, filter_height, filter_width) (for the weights)
and the 1-dimensional shape (output_channels,) (for the biases).
# for each layer with parameters, show the weight shape ([0]) and the bias shape ([1])
for layer_name, param in net.params.items():
    weight_shape = str(param[0].data.shape)
    bias_shape = str(param[1].data.shape)
    print(layer_name + '\t' + weight_shape + ' ' + bias_shape)
conv1 (96, 3, 11, 11) (96,)
conv2 (256, 48, 5, 5) (256,)
conv3 (384, 256, 3, 3) (384,)
conv4 (384, 192, 3, 3) (384,)
conv5 (256, 192, 3, 3) (256,)
fc6 (4096, 9216) (4096,)
fc7 (4096, 4096) (4096,)
fc8 (1000, 4096) (1000,)
Since we’re dealing with four-dimensional data here, we’ll define a helper function for visualizing sets of rectangular heatmaps.
def vis_square(data):
    """Show a stack of 2-D maps (or RGB images) as one roughly square grid.

    Args:
        data: array of shape (n, height, width) or (n, height, width, 3).
            Each (height, width) slice becomes one tile of a grid that is
            ceil(sqrt(n)) tiles on a side.

    The values are rescaled to [0, 1] for display. Unused grid cells and the
    one-pixel gap after each tile are filled with ones (white). The tiled
    image is drawn with ``plt.imshow`` with the axes hidden; nothing is
    returned.
    """
    # normalize data for display; a constant input has zero range, so show it
    # as all zeros instead of dividing by zero and plotting NaNs
    lowest = data.min()
    value_range = data.max() - lowest
    if value_range > 0:
        data = (data - lowest) / value_range
    else:
        data = np.zeros_like(data, dtype=float)

    # force the number of filters to be square
    n = int(np.ceil(np.sqrt(data.shape[0])))
    padding = (((0, n ** 2 - data.shape[0]),
                (0, 1), (0, 1))                # add some space between filters
               + ((0, 0),) * (data.ndim - 3))  # don't pad the last dimension (if there is one)
    data = np.pad(data, padding, mode='constant', constant_values=1)  # pad with ones (white)

    # tile the filters into an image:
    # (n*n, h, w[, c]) -> (row, col, h, w[, c]) -> (row, h, col, w[, c]) -> (row*h, col*w[, c])
    grid = data.reshape((n, n) + data.shape[1:])
    grid = grid.transpose((0, 2, 1, 3) + tuple(range(4, grid.ndim)))
    tiled = grid.reshape((n * grid.shape[1], n * grid.shape[3]) + grid.shape[4:])

    plt.imshow(tiled)
    plt.axis('off')
First we’ll look at the first layer filters, conv1
# net.params['conv1'] is [weights, biases]; take the weights, shape (96, 3, 11, 11)
filters = net.params['conv1'][0].data
# show the 96 filters as color tiles (channel axis moved to the end)
vis_square(filters.transpose(0, 2, 3, 1))

# show every input channel of every filter as its own grayscale tile:
# 96 filters * 3 channels = 288 kernels of size 11x11
# (96 * 3, not 96 ** 3: the array only holds 96 * 3 * 11 * 11 weights)
plt.figure()  # new figure so this grid is not drawn over the previous one
vis_square(filters[:96].reshape(96 * 3, 11, 11))
This shows that the first three tiles correspond to the three channels of the first block in the previous figure, the 4th, 5th and 6th tiles correspond to the second block, and so on for the remaining blocks.
# net.params['conv1'][1] holds the conv1 biases, one per filter: shape (96,)
filters_b = net.params['conv1'][1].data
print(filters_b)
# show the output after the conv1, pool1 and norm1 layers
# conv1 (1, 96, 55, 55), pool1 (1, 96, 27, 27), norm1 (1, 96, 27, 27)
for layer_name in ('conv1', 'pool1', 'norm1'):
    feat = net.blobs[layer_name].data[0]  # first (and only) image in the batch
    plt.figure()  # one figure per layer so the grids are not drawn over each other
    vis_square(feat)
The norm1 layer applies local response normalization (LRN) across the feature maps produced by pool1.
# conv2 weights: shape (256, 48, 5, 5)
filters = net.params['conv2'][0].data
# show each of the 256 * 48 single-channel 5x5 kernels as its own tile
vis_square(filters[:256].reshape(256 * 48, 5, 5))
Since the input to conv2 has 96 channels and group = 2, each conv2 filter has 96 / 2 = 48 channels. Weights are shared across spatial positions, but each input channel is convolved with its own 5x5 kernel, so every filter consists of 48 different kernels. The layer has 256 output channels, so there are 256 such filters.
# net.params['conv2'][1] holds the conv2 biases, one per filter: shape (256,)
filters_b = net.params['conv2'][1].data
print(filters_b)
conv2 (256, 48, 5, 5) (256,)
# show the result after each layer from conv2 through pool5
# conv2 (1, 256, 27, 27), pool2 (1, 256, 13, 13), norm2 (1, 256, 13, 13; after LRN),
# conv3 (1, 384, 13, 13), conv4 (1, 384, 13, 13), conv5 (1, 256, 13, 13),
# pool5 (1, 256, 6, 6)
for layer_name in ('conv2', 'pool2', 'norm2', 'conv3', 'conv4', 'conv5', 'pool5'):
    feat = net.blobs[layer_name].data[0]  # first (and only) image in the batch
    plt.figure()  # one figure per layer so the grids are not drawn over each other
    vis_square(feat)
The first fully connected layer, fc6 (rectified). We show the output values and the histogram of the positive values.
# show the result after the fc6 layer: fc6 (1, 4096)
feat = net.blobs['fc6'].data[0]
plt.subplot(2, 1, 1)
plt.plot(feat.flat)  # output value of every unit
plt.subplot(2, 1, 2)
_ = plt.hist(feat.flat[feat.flat > 0], bins=100)  # histogram of the positive values only
# show the result after the fc7 layer: fc7 (1, 4096)
feat = net.blobs['fc7'].data[0]
plt.subplot(2, 1, 1)
plt.plot(feat.flat)  # output value of every unit
plt.subplot(2, 1, 2)
_ = plt.hist(feat.flat[feat.flat > 0], bins=100)  # histogram of the positive values only
The upper plot shows the output value of each of the 4096 units, and the lower plot is a histogram of the positive output values (how many units produce each value).
# show the result after the fc8 layer: fc8 (1, 1000)
feat = net.blobs['fc8'].data[0]
plt.subplot(2, 1, 1)
plt.plot(feat.flat)  # output value of every unit
plt.subplot(2, 1, 2)
_ = plt.hist(feat.flat[feat.flat > 0], bins=100)  # histogram of the positive values only
# show the result after the prob layer: prob (1, 1000)
feat = net.blobs['prob'].data[0]
plt.figure(figsize=(15, 3))  # wide, short figure for the 1000 class probabilities
plt.plot(feat.flat)
Note the cluster of strong predictions; the labels are sorted semantically. The top peaks correspond to the top predicted labels, as shown above.