目录
- CCA算法实现和优化
- 代码框架
- 基于并查集的实现与优化
- 0. 最naive的并查集
- 1. 基于并查集的CCA优化1:路径压缩
- 2. 基于并查集的CCA优化2:基于rank做节点合并
- 3. 基于并查集的CCA优化3:邻域数量减半
- 4. 基于并查集的CCA优化4: 消除find函数的递归调用
- 5. 基于并查集的CCA优化5: 就地计算而不调用find函数
- 6. 基于并查集的CCA优化6: 就地计算而不调用union函数
- 7. 基于并查集的CCA优化7: label重新编号优化
- 8. 基于并查集的CCA优化8: 修改并查集节点id对应含义,并利用邻域对称性加速
- 基于DFS算法的实现与优化
- 1. 递归实现的DFS用于计算CCA
- 2. 递归调用改为迭代
- TODO
CCA算法实现和优化
基本思路是两种,一种是dfs,另一种是并查集(two-pass)。先考虑并查集的做法,不优化和优化,在大图(1280*720,~3000个CCA)上,速度相差巨大:30s->4ms (提升接近8000倍)。
代码框架
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "opencv2/opencv.hpp"
#include "fa_log.h"
//16*44
unsigned char test_data[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,3,3,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,3,3,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,4,4,4,4,4,1,1,1,0,0,5,5,5,5,5,3,3,3,3,3,3,3,3,3,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,4,4,4,4,4,1,1,1,0,0,5,5,5,5,5,3,3,3,3,3,3,3,3,3,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,4,4,4,0,0,0,0,0,0,0,5,5,5,0,0,0,0,0,0,0,3,3,3,3,
6,6,6,6,6,6,6,6,0,0,7,7,7,0,0,2,2,2,2,2,2,2,2,0,0,8,8,8,8,8,5,5,5,0,0,9,9,9,9,9,3,3,3,3,
6,6,6,6,6,6,6,6,0,0,7,7,7,0,0,2,2,2,2,2,2,2,2,0,0,8,8,8,8,8,5,5,5,0,0,9,9,9,9,9,3,3,3,3,
6,6,6,0,0,6,6,6,0,0,7,7,7,0,0,2,2,2,0,0,0,0,0,0,0,8,8,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
6,6,6,0,0,6,6,6,0,0,7,7,7,7,7,2,2,2,2,2,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
6,6,6,0,0,6,6,6,0,0,7,7,7,7,7,2,2,2,0,0,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,6,6,6,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
int cca_by_find_union_set(const unsigned char* image_data, int* label, int h, int w);
cv::Scalar icvprGetRandomColor()
{
uchar r = 255 * (rand() / (1.0 + RAND_MAX));
uchar g = 255 * (rand() / (1.0 + RAND_MAX));
uchar b = 255 * (rand() / (1.0 + RAND_MAX));
//printf("-- get color: (%d, %d, %d)\n", r, g, b);
return cv::Scalar(b, g, r);
}
void icvprLabelColor(const cv::Mat& _labelImg, cv::Mat& _colorLabelImg)
{
if (_labelImg.empty() ||
_labelImg.type() != CV_32SC1)
{
return;
}
std::map<int, cv::Scalar> colors;
int rows = _labelImg.rows;
int cols = _labelImg.cols;
_colorLabelImg.release();
_colorLabelImg.create(rows, cols, CV_8UC3);
_colorLabelImg = cv::Scalar::all(0);
for (int i = 0; i < rows; i++)
{
const int* data_src = (int*)_labelImg.ptr<int>(i);
uchar* data_dst = _colorLabelImg.ptr<uchar>(i);
for (int j = 0; j < cols; j++)
{
int pixelValue = data_src[j];
if (pixelValue > 1)
{
if (colors.count(pixelValue) <= 0)
{
colors[pixelValue] = icvprGetRandomColor();
}
cv::Scalar color = colors[pixelValue];
*data_dst++ = color[0];
*data_dst++ = color[1];
*data_dst++ = color[2];
}
else
{
data_dst++;
data_dst++;
data_dst++;
}
}
}
}
void cca_test(const cv::Mat& binImage, int thresh_type) {
cv::namedWindow("original image", 0);
cv::imshow("original image", binImage);
cv::threshold(binImage, binImage, 50, 1, thresh_type);
cv::Mat labelImg;
binImage.convertTo(labelImg, CV_32SC1);
const unsigned char* image_data = binImage.data;
int* image_label = (int*)labelImg.data;
int rows = binImage.rows;
int cols = binImage.cols;
long t_start = fa_gettime();
int num_cca;
num_cca = cca_by_find_union_set(image_data, image_label, rows, cols);
//num_cca = cca_by_dfs(binImage, labelImg);
long t_consume = fa_gettime() - t_start;
printf("-- cca() time cost is %lu ms\n", t_consume);
printf("-- num of cca is: %d\n", num_cca);
cv::Mat grayImg;
// since we know we only have 1 CCA, its label is 1
// so, we can multiply it for show
labelImg *= 10;
labelImg.convertTo(grayImg, CV_8UC1);
cv::namedWindow("labelImg", 0);
cv::imshow("labelImg", grayImg);
cv::Mat colorLabelImg;
icvprLabelColor(labelImg, colorLabelImg);
cv::namedWindow("colorImg", 0);
cv::imshow("colorImg", colorLabelImg);
cv::waitKey(0);
}
void cca_test_main() {
const char* im_pth;
cv::Mat binImg;
// case1
if (1) {
printf("\ntest case 1: icvpr.jpg\n");
im_pth = "F:/zhangzhuo/dev/libfc/imgs/icvpr.jpg";
binImg = cv::imread(im_pth, 0);
cca_test(binImg, CV_THRESH_BINARY_INV);
}
// case2
if (1) {
printf("\ntest case 2: qingtu.jpg\n");
im_pth = "F:/zhangzhuo/dev/libfc/imgs/qingtu.jpg";
binImg = cv::imread(im_pth, 0);
cca_test(binImg, CV_THRESH_BINARY);
}
// case3
if (1) {
printf("\ntest case 3: cca_big.jpg\n");
im_pth = "F:/zhangzhuo/dev/libfc/imgs/cca_big.jpg";
binImg = cv::imread(im_pth, 0);
cca_test(binImg, CV_THRESH_BINARY);
}
// case3
if (1) {
printf("\ntest case 4: 16x44 handdraw image\n");
int im_h = 16;
int im_w = 44;
for (int i = 0; i < im_h*im_w; i++) {
if (test_data[i] > 0) {
test_data[i] = 255;
}
}
binImg = cv::Mat(cv::Size(im_w, im_h), CV_8UC1);
binImg.data = test_data;
cca_test(binImg, CV_THRESH_BINARY);
}
}
int main() {
cca_test_main();
return 0;
}
基于并查集的实现与优化
0. 最naive的并查集
// fus: find-union-set
// find root of specified node
int fus_find(int x, int* p){
return x == p[x] ? x : fus_find(p[x], p);
}
// merge two nodes by changing its parent node
void fus_union(int a, int b, int* p) {
p[fus_find(a, p)] = fus_find(b, p);
}
int cca_by_find_union_set(const unsigned char* image_data, int* label, int h, int w)
{
// 1. first pass
int cnt = 0;
int i, j, k, idx;
int neighbor_num = 4;
int row, col, neighbor_idx;
int shift_row[4] = { -1, 1, 0, 0 };
int shift_col[4] = { 0, 0, -1, 1 };
int* p = (int*)malloc(sizeof(int)*h*w);
for (i = 0; i < w*h; i++) {
p[i] = i;
}
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
idx = i * w + j;
if (image_data[idx] != 1) continue;
for (k = 0; k < neighbor_num; k++) {
row = i + shift_row[k];
col = j + shift_col[k];
neighbor_idx = row * w + col;
if (row < 0 || row >= h || col < 0 || col >= w || image_data[neighbor_idx] != 1) continue;
fus_union(idx, neighbor_idx, p);
}
}
}
int* bowl = (int*)malloc(sizeof(int)*h*w);
memset(bowl, 0, sizeof(int)*h*w);
int label_cnt = 0;
for (i = 0; i < h*w; i++) {
if (image_data[i] != 1) continue;
int t = fus_find(i, p);
p[i] = t;
if (bowl[t] == 0) {
label_cnt++;
bowl[t] = label_cnt;
}
}
for (i = 0; i < h*w; i++) {
label[i] = bowl[p[i]];
}
free(p); p = NULL;
free(bowl); bowl = NULL;
return label_cnt;
}
VS2017下的时间开销:
图片 | 图像大小 | CCA个数 | 时间 (naive,Release) |
---|---|---|---|
icvpr.jpg | 90*364 | 10 | 6ms |
qintu.jpg | 165*652 | 17 | 9ms |
cca_big.jpg | 720*1280 | 3807 | 31832ms |
本来是想Debug模式测试的,但是cca_big.jpg
是在太慢就用Release模式了。
1. 基于并查集的CCA优化1:路径压缩
并查集的find函数使用路径压缩,Release模式下1280x720大图3087个cca,时间开销从31832ms减少到22ms,Debug模式下为125ms。代码修改很简单,从原来的:
int fus_find(int x, int* p) {
return x == p[x] ? x : fus_find(p[x], p);
}
修改为
int fus_find(int x, int* p) {
return x == p[x] ? x : p[x]=fus_find(p[x], p);
}
图片 | 图像大小 | CCA个数 | 时间 (naive, Release) |
时间 (opt1, Release) |
时间 (opt1, Debug) |
---|---|---|---|---|---|
icvpr.jpg | 90*364 | 10 | 6ms | 1ms | 2ms |
qintu.jpg | 165*652 | 17 | 9ms | 1ms | 4ms |
cca_big.jpg | 720*1280 | 3807 | 31832ms | 22ms | 125ms |
2. 基于并查集的CCA优化2:基于rank做节点合并
这次优化的是并查集的union函数,如果需要合并则根据rank大小来合并,减少了树的层级。代码从:
void fus_union(int a, int b, int* p) {
p[fus_find(a, p)] = fus_find(b, p);
}
修改为:
void fus_union(int a, int b, int* p, int* rank) {
int ra = fus_find(a, p);
int rb = fus_find(b, p);
if (ra == rb) return;
if (rank[ra] > rank[rb]) {
p[rb] = ra;
}
else {
if (rank[ra] == rank[rb]) {
rank[rb]++;
}
p[ra] = rb;
}
}
其中参数rank
需要在主调函数中初始化,数组个数和p
数组个数相同,初始值都设定为1:
int* rank = (int*)malloc(sizeof(int)*h*w);
...
for (i = 0; i < w*h; i++) {
p[i] = i;
rank[i] = 1; //new added
}
for
for
for
fus_union(idx, neighbor_idx, p, rank); //modified
...
free(p); p = NULL;
free(rank); rank = NULL; //new added
free(bowl); bowl = NULL;
时间开销上看,比上一次要快,但是Debug模式下cca_big.jpg
这张图仍然很捉急。
图片 | 图像大小 | CCA个数 | 时间 (naive, Release) |
时间 (opt1, Release) |
时间 (opt1, Debug) |
时间 (opt2, Release) |
时间 (opt, Debug) |
---|---|---|---|---|---|---|---|
icvpr.jpg | 90*364 | 10 | 6ms | 1ms | 2ms | 0ms | 3ms |
qintu.jpg | 165*652 | 17 | 9ms | 1ms | 4ms | 0ms | 3ms |
cca_big.jpg | 720*1280 | 3807 | 31832ms | 22ms | 125ms | 15ms | 97ms |
3. 基于并查集的CCA优化3:邻域数量减半
考虑到遍历整张图像时,左边和上边的像素都访问过了,因而每个像素考察周边元素做union操作时,只需要考虑左方和上方(目前仅考虑4邻域,8邻域与之做法类似)。关键代码修改:
int neighbor_num = 4;
int shift_row[4] = { -1, 1, 0, 0 };
int shift_col[4] = { 0, 0, -1, 1 };
修改为:
int neighbor_num = 2;
int shift_row[2] = { 0, -1 };
int shift_col[2] = {-1, 0 };
图片 | 图像大小 | CCA个数 | 时间 (naive, Release) |
时间 (opt2, Release) |
时间 (opt2, Debug) |
时间 (opt3, Release) |
时间 (opt3, Debug) |
---|---|---|---|---|---|---|---|
icvpr.jpg | 90*364 | 10 | 6ms | 0ms | 3ms | 0ms | 1ms |
qintu.jpg | 165*652 | 17 | 9ms | 0ms | 3ms | 0ms | 2ms |
cca_big.jpg | 720*1280 | 3807 | 31832ms | 15ms | 97ms | 11ms | 57ms |
对于cca_big.jpg
这张图,Release模式下从15ms降到11ms,Debug模式下则从97ms减少到57ms,减少了接近一半。
4. 基于并查集的CCA优化4: 消除find函数的递归调用
使用递归函数的好处是思路清晰,缺点是如果调用层次过于深则容易产生stack overflow(栈溢出)问题。例如VS2017下栈空间大小默认值为1MB(1073741824),需要更大的栈则要修改编译链接选项。消除递归函数同时还能一定程度上减少时间开销。代码修改,从原来的:
int fus_find(int x, int* p) {
return x == p[x] ? x : p[x]=fus_find(p[x], p);
}
修改为:
int fus_find(int x, int* p) {
int a = x;
while (a != p[a]) {
a = p[a];
}
while (x != p[x]) {
x = p[x];
p[x] = a;
}
return a;
}
图片 | 图像大小 | CCA个数 | 时间 (naive, Release) |
时间 (opt3, Release) |
时间 (opt3, Debug) |
时间 (opt4, Release) |
时间 (opt4, Debug) |
---|---|---|---|---|---|---|---|
icvpr.jpg | 90*364 | 10 | 6ms | 0ms | 1ms | 1ms | 1ms |
qintu.jpg | 165*652 | 17 | 9ms | 0ms | 2ms | 1ms | 2ms |
cca_big.jpg | 720*1280 | 3807 | 31832ms | 11ms | 57ms | 9ms | 48ms |
这次优化的结果是,小图片(icvpr.jpg, qingtu.jpg)在Release模式下时间开销从0ms(实际上小于1ms)增加到1ms,增加不明显;而cca_big.jpg
这张大图无论是Release还是Debug模式,时间开销都有减少,分别为9ms和2ms。但是,Debug模式下的48ms还是太慢。
5. 基于并查集的CCA优化5: 就地计算而不调用find函数
前一步,已经把find函数从递归改成非递归;考虑到find函数在union函数和算法主体函数中被多次调用,就地展开计算替代函数调用可以加速计算。
代码修改有两处,第一个是union函数,原来的:
#define OPT5
void fus_union(int a, int b, int* p, int* rank) {
int ra = fus_find(a, p);
int rb = fus_find(b, p);
if (ra == rb) return;
if (rank[ra] > rank[rb]) {
p[rb] = ra;
}
else {
if (rank[ra] == rank[rb]) {
rank[rb]++;
}
p[ra] = rb;
}
}
修改为:
#define OPT5
void fus_union(int a, int b, int* p, int* rank) {
while (a != p[a]) {
a = p[a];
}
int ra = a;
while (b != p[b]) {
b = p[b];
}
int rb = b;
if (ra == rb) return;
if (rank[ra] > rank[rb]) {
p[rb] = ra;
}
else {
if (rank[ra] == rank[rb]) {
rank[rb]++;
}
p[ra] = rb;
}
}
第二处修改:算法主体函数中的后处理部分,原来的:
int t = fus_find(i, p);
修改为:
int t = i;
while (p[t] != t) {
t = p[t];
}
图片 | 图像大小 | CCA个数 | 时间 (naive, Release) |
时间 (opt4, Release) |
时间 (opt4, Debug) |
时间 (opt5, Release) |
时间 (opt5, Debug) |
---|---|---|---|---|---|---|---|
icvpr.jpg | 90*364 | 10 | 6ms | 1ms | 1ms | 0ms | 1ms |
qintu.jpg | 165*652 | 17 | 9ms | 1ms | 2ms | 0ms | 2ms |
cca_big.jpg | 720*1280 | 3807 | 31832ms | 9ms | 48ms | 9ms | 28ms |
Release模式下没有明显变化,Debug模式下大图cca_big.jpg
时间开销从48ms减少到28ms。
6. 基于并查集的CCA优化6: 就地计算而不调用union函数
类似于前一个优化步骤把find函数就地计算而不去调用。现在把union函数就地展开,Debug模式下cca_big.jpg
速度从28ms提升到22ms。不过算法主体函数的双重for循环也变得ugly:
int a, b;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
idx = i * w + j;
if (image_data[idx] != 1) continue;
for (k = 0; k < neighbor_num; k++) {
row = i + shift_row[k];
col = j + shift_col[k];
neighbor_idx = row * w + col;
if (row < 0 || row >= h || col < 0 || col >= w || image_data[neighbor_idx] != 1) continue;
//fus_union(idx, neighbor_idx, p, rank);
a = idx; b = neighbor_idx;
while (a != p[a]){
a = p[a];
}
int ra = a;
while (b != p[b]) {
b = p[b];
}
int rb = b;
if (ra == rb) continue;
if (rank[ra] > rank[rb]) {
p[rb] = ra;
}
else {
if (rank[ra] == rank[rb]) {
rank[rb]++;
}
p[ra] = rb;
}
}
}
}
图片 | 图像大小 | CCA个数 | 时间 (naive, Release) |
时间 (opt5, Release) |
时间 (opt5, Debug) |
时间 (opt6, Release) |
时间 (opt6, Debug) |
---|---|---|---|---|---|---|---|
icvpr.jpg | 90*364 | 10 | 6ms | 0ms | 1ms | 1ms | 1ms |
qintu.jpg | 165*652 | 17 | 9ms | 0ms | 2ms | 1ms | 1ms |
cca_big.jpg | 720*1280 | 3807 | 31832ms | 9ms | 28ms | 8ms | 22ms |
7. 基于并查集的CCA优化7: label重新编号优化
先前是做两次for循环:
int* bowl = (int*)malloc(sizeof(int)*h*w);
memset(bowl, 0, sizeof(int)*h*w);
for (i = 0; i < h*w; i++) {
if (image_data[i] != 1) continue;
int t = i;
while (p[t] != t) {
t = p[t];
}
p[i] = t;
if (bowl[t] == 0) {
label_cnt++;
bowl[t] = label_cnt;
}
}
for (i = 0; i < h*w; i++) {
label[i] = bowl[p[i]];
}
free(p); p = NULL;
free(rank); rank = NULL;
free(bowl); bowl = NULL;
现在改成一次循环:
int* bowl = (int*)malloc(sizeof(int)*h*w);
memset(bowl, 0, sizeof(int)*h*w);
int x;
for (i = 0; i < h*w; i++) {
if (image_data[i]) {
if (p[i] != i) {
x = i;
while (x != p[x]) {
x = p[x];
}
p[i] = x;
if (bowl[x] == 0) {
label_cnt++;
bowl[x] = label_cnt;
}
label[i] = bowl[x];
}
else {
if (bowl[i] == 0) {
label_cnt++;
bowl[i] = label_cnt;
}
label[i] = bowl[i];
}
}
}
free(bowl); bowl = NULL;
free(p); p = NULL;
free(rank); rank = NULL;
opt6,debug | opt7,debug |
---|---|
1ms | 1ms |
1ms | 1ms |
22ms | 20~21ms |
大图快了1~2ms。这个阶段再优化,似乎要把几个邻域的for循环展开,并且把union()过程拆分合并重新组合。这对于扩展到8邻域似乎不太友好。先贴一下目前的整体代码。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "opencv2/opencv.hpp"
#include "fa_log.h"
//16*44
unsigned char test_data[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,3,3,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,3,3,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,4,4,4,4,4,1,1,1,0,0,5,5,5,5,5,3,3,3,3,3,3,3,3,3,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,4,4,4,4,4,1,1,1,0,0,5,5,5,5,5,3,3,3,3,3,3,3,3,3,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,4,4,4,0,0,0,0,0,0,0,5,5,5,0,0,0,0,0,0,0,3,3,3,3,
6,6,6,6,6,6,6,6,0,0,7,7,7,0,0,2,2,2,2,2,2,2,2,0,0,8,8,8,8,8,5,5,5,0,0,9,9,9,9,9,3,3,3,3,
6,6,6,6,6,6,6,6,0,0,7,7,7,0,0,2,2,2,2,2,2,2,2,0,0,8,8,8,8,8,5,5,5,0,0,9,9,9,9,9,3,3,3,3,
6,6,6,0,0,6,6,6,0,0,7,7,7,0,0,2,2,2,0,0,0,0,0,0,0,8,8,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
6,6,6,0,0,6,6,6,0,0,7,7,7,7,7,2,2,2,2,2,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
6,6,6,0,0,6,6,6,0,0,7,7,7,7,7,2,2,2,0,0,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,6,6,6,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
int cca_by_find_union_set(const unsigned char* image_data, int* label, int h, int w);
cv::Scalar icvprGetRandomColor()
{
uchar r = 255 * (rand() / (1.0 + RAND_MAX));
uchar g = 255 * (rand() / (1.0 + RAND_MAX));
uchar b = 255 * (rand() / (1.0 + RAND_MAX));
//printf("-- get color: (%d, %d, %d)\n", r, g, b);
return cv::Scalar(b, g, r);
}
void icvprLabelColor(const cv::Mat& _labelImg, cv::Mat& _colorLabelImg)
{
if (_labelImg.empty() ||
_labelImg.type() != CV_32SC1)
{
return;
}
std::map<int, cv::Scalar> colors;
int rows = _labelImg.rows;
int cols = _labelImg.cols;
_colorLabelImg.release();
_colorLabelImg.create(rows, cols, CV_8UC3);
_colorLabelImg = cv::Scalar::all(0);
for (int i = 0; i < rows; i++)
{
const int* data_src = (int*)_labelImg.ptr<int>(i);
uchar* data_dst = _colorLabelImg.ptr<uchar>(i);
for (int j = 0; j < cols; j++)
{
int pixelValue = data_src[j];
if (pixelValue > 1)
{
if (colors.count(pixelValue) <= 0)
{
colors[pixelValue] = icvprGetRandomColor();
}
cv::Scalar color = colors[pixelValue];
*data_dst++ = color[0];
*data_dst++ = color[1];
*data_dst++ = color[2];
}
else
{
data_dst++;
data_dst++;
data_dst++;
}
}
}
}
void cca_test(const cv::Mat& binImage, int thresh_type) {
cv::namedWindow("original image", 0);
cv::imshow("original image", binImage);
cv::threshold(binImage, binImage, 50, 1, thresh_type);
cv::Mat labelImg;
binImage.convertTo(labelImg, CV_32SC1);
const unsigned char* image_data = binImage.data;
int* image_label = (int*)labelImg.data;
int rows = binImage.rows;
int cols = binImage.cols;
long t_start = fa_gettime();
int num_cca;
num_cca = cca_by_find_union_set(image_data, image_label, rows, cols);
//num_cca = cca_by_dfs(binImage, labelImg);
long t_consume = fa_gettime() - t_start;
printf("-- cca() time cost is %lu ms\n", t_consume);
printf("-- num of cca is: %d\n", num_cca);
cv::Mat grayImg;
// since we know we only have 1 CCA, its label is 1
// so, we can multiply it for show
labelImg *= 10;
labelImg.convertTo(grayImg, CV_8UC1);
cv::namedWindow("labelImg", 0);
cv::imshow("labelImg", grayImg);
cv::Mat colorLabelImg;
icvprLabelColor(labelImg, colorLabelImg);
cv::namedWindow("colorImg", 0);
cv::imshow("colorImg", colorLabelImg);
cv::waitKey(0);
}
void cca_test_main() {
const char* im_pth;
cv::Mat binImg;
// case1
if (1) {
printf("\ntest case 1: icvpr.jpg\n");
im_pth = "F:/zhangzhuo/dev/libfc/imgs/icvpr.jpg";
binImg = cv::imread(im_pth, 0);
cca_test(binImg, CV_THRESH_BINARY_INV);
}
// case2
if (1) {
printf("\ntest case 2: qingtu.jpg\n");
im_pth = "F:/zhangzhuo/dev/libfc/imgs/qingtu.jpg";
binImg = cv::imread(im_pth, 0);
cca_test(binImg, CV_THRESH_BINARY);
}
// case3
if (1) {
printf("\ntest case 3: cca_big.jpg\n");
im_pth = "F:/zhangzhuo/dev/libfc/imgs/cca_big.jpg";
binImg = cv::imread(im_pth, 0);
cca_test(binImg, CV_THRESH_BINARY);
}
// case3
if (1) {
printf("\ntest case 4: 16x44 handdraw image\n");
int im_h = 16;
int im_w = 44;
for (int i = 0; i < im_h*im_w; i++) {
if (test_data[i] > 0) {
test_data[i] = 255;
}
}
binImg = cv::Mat(cv::Size(im_w, im_h), CV_8UC1);
binImg.data = test_data;
cca_test(binImg, CV_THRESH_BINARY);
}
}
// int fus_find(int x, int* p) {
// int a = x;
// while (a != p[a]) {
// a = p[a];
// }
// while (x != p[x]) {
// x = p[x];
// p[x] = a;
// }
// return a;
// }
// void fus_union(int a, int b, int* p, int* rank) {
// int x, t;
// x = a;
// while (x != p[x]) {
// x = p[x];
// }
// int ra = x;
// x = a;
// while (x != p[x]) {
// t = p[x];
// p[x] = ra;
// x = t;
// }
// p[a] = ra;
// x = b;
// while (x != p[x]) {
// x = p[x];
// }
// int rb = x;
// x = b;
// while (x != p[x]) {
// t = p[x];
// p[x] = rb;
// x = t;
// }
// p[b] = rb;
// if (ra == rb) return;
// if (rank[ra] > rank[rb]) {
// p[rb] = ra;
// }
// else {
// if (rank[ra] == rank[rb]) {
// rank[rb]++;
// }
// p[ra] = rb;
// }
// }
int cca_by_find_union_set(const unsigned char* image_data, int* label, int h, int w)
{
// 1. first pass
int cnt = 0;
int i, j, k, idx;
int row, col, neighbor_idx;
//int neighbor_num = 4;
//int shift_row[4] = { -1, 1, 0, 0 };
//int shift_col[4] = { 0, 0, -1, 1 };
int neighbor_num = 2;
int shift_row[2] = { 0, -1 };
int shift_col[2] = {-1, 0 };
int* p = (int*)malloc(sizeof(int)*h*w);
int* rank = (int*)malloc(sizeof(int)*h*w);
for (i = 0; i < w*h; i++) {
p[i] = i;
rank[i] = 1;
}
int a, b;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
idx = i * w + j;
if (image_data[idx] != 1) continue;
for (k = 0; k < neighbor_num; k++) {
row = i + shift_row[k];
col = j + shift_col[k];
neighbor_idx = row * w + col;
if (row < 0 || row >= h || col < 0 || col >= w || image_data[neighbor_idx] != 1) continue;
a = idx; b = neighbor_idx;
while (a != p[a]) {
a = p[a];
}
int ra = a;
while (b != p[b]) {
b = p[b];
}
int rb = b;
if (ra == rb) continue;
if (rank[ra] > rank[rb]) {
p[rb] = ra;
}
else {
if (rank[ra] == rank[rb]) {
rank[rb]++;
}
p[ra] = rb;
}
}
}
}
int label_cnt = 0;
int* bowl = (int*)malloc(sizeof(int)*h*w);
memset(bowl, 0, sizeof(int)*h*w);
int x;
for (i = 0; i < h*w; i++) {
if (image_data[i]) {
if (p[i] != i) {
x = i;
while (x != p[x]) {
x = p[x];
}
p[i] = x;
if (bowl[x] == 0) {
label_cnt++;
bowl[x] = label_cnt;
}
label[i] = bowl[x];
}
else {
if (bowl[i] == 0) {
label_cnt++;
bowl[i] = label_cnt;
}
label[i] = bowl[i];
}
}
}
free(bowl); bowl = NULL;
free(p); p = NULL;
free(rank); rank = NULL;
return label_cnt;
}
int main() {
cca_test_main();
return 0;
}
8. 基于并查集的CCA优化8: 修改并查集节点id对应含义,并利用邻域对称性加速
前面的做法,都是把像素id作为并查集的节点id。这看起来的确符合等价类的性质:初始时p[x]=x,满足自反性,每个节点都是自己所在等价类的编号。
现在换个做法,把每个像素点对应的label作为并查集的节点id,则并查集现在是对于label做等价类计算与存储。则像素对应的id(1维数组索引)不再被绑定在一起。
此外,考虑到4邻域、8邻域都是对称结构。利用二重循环遍历图像像素时,每个像素被遍历过的邻域和没有被遍历过的邻域是对称的。则只需要考虑遍历过的邻域即可,理论上看起来计算量大概减半的样子。
主要参考了 C代码二值图像连通区域标记 这篇博客,我修改后速度略慢于改实现,但能够处理8邻域,代码看起来可读性也更高。
代码如下:
int cca_by_find_union_set(const unsigned char* image_data, int* label, int h, int w)
{
// 1. first pass
int i, j, k;
//int neighbor_num = 2; //4 neighborhood only need to consider two previous pixels
int neighbor_num = 4; //8 neighborhood only need to consider 4 previous pixels
int neighbor[] = { -1, -w, -w-1, -w+1};
int shift_row[] = { 0, -1, -1, -1};
int shift_col[] = {-1, 0, -1, 1};
size_t buf_sz = sizeof(int)*h*w;
int* p = (int*)malloc(buf_sz);
int* rank = (int*)malloc(buf_sz);
memset(p, 0, buf_sz);
memset(rank, 0, buf_sz);
memset(label, 0, buf_sz);
int lb = 1;
const unsigned char* bw = image_data;
int* mark = label;
int a, b, t, ra, rb, row, col;
for (i = 0; i < h; i++, bw+=w, mark+=w) {
for (j = 0; j < w; j++) {
if (!bw[j]) continue;
bool has_neighbor = false;
for (k = 0; k < neighbor_num; k++) {
// ignore invalid neighbors
row = i + shift_row[k];
col = j + shift_col[k];
if (row < 0 || row >= h || col < 0 || col >= w) continue;
// if neighbor is labeled 0, ignore this neighbor
a = mark[j+neighbor[k]];
if (!a) continue;
// current pixel is not labeled, and neighbor is labeled
// label current pixel according to this neighbor
if (!mark[j]) {
t = a;
while (a != p[a]) {
a = p[a];
}
p[t] = a;
mark[j] = a;
}
else if (mark[j] != a) {
// merge neighbor's label with current pixel's label
mark[j + neighbor[k]] = mark[j];
t = a;
while (a != p[a]) {
a = p[a];
}
p[t] = a;
b = mark[j];
//if (a < b) p[b] = a; else p[a] = b;
// merge according to rank. seems no boost for speed
if (rank[a] > rank[b]) {
p[b] = a;
}
else {
if (rank[a] == rank[b]) {
rank[b]++;
}
p[a] = b;
}
}
has_neighbor = true;
}
if (!has_neighbor) {
mark[j] = lb;
p[lb] = lb;
lb++;
}
}
}
// 2. second pass
int label_cnt = 0;
for (i = 1; i < lb; i++) {
if (i == p[i]) {
label_cnt++;
p[i] = -label_cnt;
}
}
for (i = 1; i < lb; i++) {
t = i;
//find root of i
while (p[t] >= 0) {
t = p[t];
}
// set the value of label i
p[i] = p[t];
}
//negative to positive
for (i = 1; i < lb; i++) {
p[i] = -p[i];
}
//replace existing label values by the corresponding root label values
mark = label;
for (i = 0; i < h; i++, mark += w) {
for (j = 0; j < w; j++) {
mark[j] = p[mark[j]];
}
}
free(p); p = NULL;
free(rank); rank = NULL;
return label_cnt;
}
opt6,debug | opt7,debug | opt8,debug,8邻 | opt8,release,8邻 | opt8,debug,4邻 | opt8,release,4邻 | |
---|---|---|---|---|---|---|
1ms | 1ms | 1ms | 0ms | 0ms | 0ms | |
1ms | 1ms | 1ms | 1ms | 1ms | 1ms | |
22ms | 20~21ms | 17ms | 6ms | 12ms | 4ms |
基于DFS算法的实现与优化
1. 递归实现的DFS用于计算CCA
DFS的经典题目是UVA572 Oil Deposits,直接递归调用DFS,写起来快,直接AC。稍微修改一下,就能处理CCA问题。和前面基于并查集的实现使用相同的测试图片,测试时间也比较快:
debug,8邻 | release,8邻 | debug,4邻 | release,4邻 |
---|---|---|---|
1ms | 0ms | 0ms | 1ms |
2ms | 0ms | 1ms | 0ms |
67ms | 28ms | 46ms | 24ms |
唯一需要说明的问题是:递归调用次数很多,VS2017下默认1M的占空间根本不够用,我开的是3073741824大小。
void cca_dfs(int x, int y, int label_cnt, int* vis, const unsigned char* image_data, int h, int w)
{
if (x < 0 || x >= h || y < 0 || y >= w) return;
int idx = x * w + y;
if (vis[idx] > 0 || image_data[idx] != 1) return;
vis[idx] = label_cnt;
for (int i = -1; i <= 1; i++) {
for (int j = -1; j <= 1; j++) {
if ((i != 0 || j != 0)) { //8邻域
//if ((i*i+j*j!=2) && (i != 0 || j != 0)) { //4邻域
cca_dfs(x + i, y + j, label_cnt, vis, image_data, h, w);
}
}
}
}
int cca_by_dfs(const unsigned char* image_data, int* label, int h, int w)
{
int i, j, k, idx;
int label_cnt = 0;
int* vis = (int*)malloc(sizeof(int)*h*w);
memset(vis, 0, sizeof(int)*h*w);
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
idx = i * w + j;
if (vis[idx] == 0 && image_data[idx] == 1) {
cca_dfs(i, j, ++label_cnt, vis, image_data, h, w);
}
}
}
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
idx = i * w + j;
label[idx] = vis[idx];
}
}
free(vis);
vis = NULL;
return label_cnt;
}
2. 递归调用改为迭代
初步尝试了下,时间开销反而变大了:40ms->1000ms。可怕。
TODO
和OpenCV中的实现进行性能比较。
原文地址:https://www.cnblogs.com/zjutzz/p/11048377.html