LDA Topic Model Study Notes 5: Understanding the C Source Code

1. Overview

This post annotates the main LDA logic in the C code provided by the authors of the original LDA paper. The code can be downloaded here: https://github.com/Blei-Lab/lda-c

The code implements the LDA model introduced in the paper "Latent Dirichlet Allocation" and estimates its parameters with a variational EM algorithm.

A few minor changes were made so that the code runs in VS2013; they do not affect the logic of the original code.

The VS2013 project can be downloaded from my resources:

http://download.csdn.net/detail/happyer88/8861773

----------------------------------------------------------------------

2. Prerequisites

1. LDA theory and derivation

The paper "Latent Dirichlet Allocation"

My LDA study notes, parts 1-4

2. Sufficient statistics

https://en.wikipedia.org/wiki/Sufficient_statistic

----------------------------------------------------------------------

3. Annotated Code

3.1 main.c

In the original code, the main function lives in lda-estimate.c; it was moved into main.c when the VS project was created.

#include <stdio.h>
#include <stdlib.h>
#include <io.h>
#include <conio.h>   /* for getch() below */
#include <time.h>
#include "cokus.h"
#include "lda-alpha.h"
#include "lda-data.h"
#include "lda-estimate.h"
#include "lda-inference.h"
#include "lda.h"
#include "utils.h"

char * datasetName	= "scene8";	// dataset name; must match the folder name
int expec = 1;		// expec==1: run estimation, then inference; 0: inference only
int vocabularySize_global = 512; // vocabulary size
int k = 100; // number of topics
char* params ="../settings.txt"; // estimation: settings needed by the estimation step
char* params1 ="../inf-settings.txt";     // inference: settings needed by the inference step
char  dataset_train[500];	// estimation: training data file
char  dataset_test[500];   // inference: test data file
char dir_trainData[500];    // estimation: directory for intermediate and final output
char dir_testData[500];		// inference: directory for intermediate and final output
char model_pre[500];
void assignParameter();
int main()
{
	corpus* corpus;
	clock_t start,finish;
	double totaltime;
	long double totaltime_EMiteration;
	assignParameter();
	//myCreateDirectory();

	start=clock();
	if(expec)
	{
		INITIAL_ALPHA = 1;    // Dirichlet parameter alpha
		NTOPICS =k;           // number of topics
		read_settings(params);  // read settings: max iterations and convergence threshold for variational inference, plus the max iterations and convergence threshold for EM
		corpus = read_data(dataset_train); // read data; format, one line per document: number of distinct words in the document (words with count 0 dropped), then index_word1:counts index_word2:counts ...

		totaltime_EMiteration = run_em("seeded", dir_trainData, corpus); // solve for the parameters via EM; inputs: directory for intermediate and final output, and the corpus

		printf("inferencing test images!\n");
		read_settings(params1);
		corpus = read_data(dataset_test);
		infer(model_pre, dir_testData, corpus);
		// free when done
	}
	else
	{
		read_settings(params1);
		corpus = read_data(dataset_test);
		infer(model_pre, dir_testData, corpus);
	}
	finish=clock();
	totaltime=(double)(finish-start)/CLOCKS_PER_SEC;

	printf("nTopic = %d, nTerm = %d estimation time: \n", k, vocabularySize_global);
	printf("  EM iteration takes %f seconds(this is %f miniutes)\n", totaltime_EMiteration*60, totaltime_EMiteration);

	printf("Running Time(--estimate trainData and inference trainData and testData--):%f\n",totaltime);

	printf("\ntrain--- final data are saved to: %s\n", dir_trainData);
	printf("test---- final data are saved to: %s\n", dir_testData);
	getch();
	return(0);
}

void assignParameter()
{
	sprintf(dataset_train,"../train.txt");
	sprintf(dataset_test,"../test.txt");

	sprintf(dir_trainData, "../ResultData");
	sprintf(dir_testData, "../ResultData");
	sprintf(model_pre, "../ResultData/final");

}

3.2 lda.h

Custom data structures.

#ifndef LDA_H
#define LDA_H

typedef struct
{
    int* words; // the document's words, stored as their IDs in the corpus vocabulary
    int* counts; // how many times each word occurs in the document
    int length; // number of distinct words in the document; repeated occurrences are not counted
    int total;  // total number of words in the document, repeats included
} document;

typedef struct
{
    document* docs;
    int num_terms; // number of distinct words across the corpus, i.e., the corpus vocabulary size
    int num_docs; // number of documents in the corpus
} corpus;

typedef struct
{
    double alpha; // the model parameter alpha from the paper; in general it is k-dimensional, but this code implements a symmetric Dirichlet, so all k components share a single value
    double** log_prob_w; // the model parameter beta from the paper; each row holds one topic's word distribution (in log space), dimensions k*V
    int num_topics; //主题个数
    int num_terms;
} lda_model;

typedef struct
{
    double** class_word; // sufficient statistics for the model parameter beta, dimensions: number of topics * vocabulary size (K*V)
    double* class_total; // sufficient statistics for the topic assignments z, dimension: number of topics (K)
    double alpha_suffstats;  // sufficient statistic for the model parameter alpha
    int num_docs;
} lda_suffstats;

#endif
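To make the role of these statistics concrete, here is how doc_e_step (section 3.4 below) fills in the beta statistics, written out as formulas; following the paper's notation, c_{dn} is the count of the n-th distinct word of document d and \phi_{dnk} its variational topic probability:

$$\mathrm{class\_word}[k][w] = \sum_{d}\;\sum_{n:\,w_{dn}=w} c_{dn}\,\phi_{dnk}, \qquad \mathrm{class\_total}[k] = \sum_{w}\mathrm{class\_word}[k][w]$$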

3.3 lda-model.c

This file mainly initializes the LDA model (three methods): zero-initializing everything, 'random' (random numbers), and 'seeded' (randomly picking some documents to seed the model).

It also computes the model parameters alpha and beta from the sufficient statistics (lda_mle).

#include "lda-model.h"

/*
 * compute MLE lda model from sufficient statistics
 *
 */

void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha)
{
    int k; int w;

    for (k = 0; k < model->num_topics; k++)
    {
        for (w = 0; w < model->num_terms; w++)
        {
            if (ss->class_word[k][w] > 0)
            {
				// log_prob_w is the model parameter beta, the topic-word distribution
				// class_word and class_total are both sufficient statistics
				// so subtracting the logs performs normalization: each beta entry is a probability in [0,1]
                model->log_prob_w[k][w] =
                    log(ss->class_word[k][w]) -
                    log(ss->class_total[k]);
            }
            else
                model->log_prob_w[k][w] = -100;
        }
    }
    if (estimate_alpha == 1)
    {
		// optimize alpha with Newton's method
		// note what alpha_suffstats holds here (it is accumulated in doc_e_step, lda-estimate.c)
        model->alpha = opt_alpha(ss->alpha_suffstats,
                                 ss->num_docs,
                                 model->num_topics);

        printf("new alpha = %5.5f\n", model->alpha);
    }
}
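Written out, the beta update above is simply count normalization carried out in log space:

$$\log\beta_{kw} = \log\mathrm{class\_word}[k][w] - \log\mathrm{class\_total}[k], \qquad \text{i.e.}\quad \beta_{kw} = \frac{n_{kw}}{\sum_{v} n_{kv}}$$

where n_{kw} is the expected count of word w under topic k accumulated during the E-step.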

/*
 * allocate sufficient statistics
 *
 */

lda_suffstats* new_lda_suffstats(lda_model* model)
{
    int num_topics = model->num_topics;
    int num_terms = model->num_terms;
    int i,j;

    lda_suffstats* ss = malloc(sizeof(lda_suffstats));
    ss->class_total = malloc(sizeof(double)*num_topics);
    ss->class_word = malloc(sizeof(double*)*num_topics);
    for (i = 0; i < num_topics; i++)
    {
		ss->class_total[i] = 0;
		ss->class_word[i] = malloc(sizeof(double)*num_terms);
		for (j = 0; j < num_terms; j++)
		{
			ss->class_word[i][j] = 0;
		}
    }
    return(ss);
}
void free_lda_ss(lda_suffstats* ss, lda_model* model)
{
	int i=0;
	for (i=0; i < model->num_topics; i++)
		free(ss->class_word[i]);
	free(ss->class_word);
	free(ss->class_total);
	free(ss);
}

/*
 * various intializations for the sufficient statistics
 *
 */

void zero_initialize_ss(lda_suffstats* ss, lda_model* model)
{
    int k, w;
    for (k = 0; k < model->num_topics; k++)
    {
        ss->class_total[k] = 0;
        for (w = 0; w < model->num_terms; w++)
        {
            ss->class_word[k][w] = 0;
        }
    }
    ss->num_docs = 0;
    ss->alpha_suffstats = 0;
}

void random_initialize_ss(lda_suffstats* ss, lda_model* model)
{
    int num_topics = model->num_topics;
    int num_terms = model->num_terms;
    int k, n;
    for (k = 0; k < num_topics; k++)
    {
        for (n = 0; n < num_terms; n++)
        {
            ss->class_word[k][n] += 1.0/num_terms + myrand();
            ss->class_total[k] += ss->class_word[k][n];
        }
    }
}

void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c)
{
    int num_topics = model->num_topics;
    int i, k, d, n;
    document* doc;

    for (k = 0; k < num_topics; k++) // seed each topic's topic-word distribution statistics with a few documents
    {
        for (i = 0; i < NUM_INIT; i++) // pick NUM_INIT (=1) documents from the corpus at random
        {
            d = floor(myrand() * c->num_docs); // random pick
            printf("initialized with document %d\n", d);
            doc = &(c->docs[d]);
            for (n = 0; n < doc->length; n++)
            {
				// add the word counts of the NUM_INIT documents to topic k's word-distribution statistics
                ss->class_word[k][doc->words[n]] += doc->counts[n];
            }
        }
        for (n = 0; n < model->num_terms; n++)
        {
            ss->class_word[k][n] += 1.0; // smoothing: the log is taken later, so every value must be strictly positive
			// class_total[k] is the row sum of class_word: how often topic k was chosen, i.e., the total count of words under the topic
            ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n];
        }
		// initialized from document counts plus smoothing, class_total can never be 0 here
		//if (ss->class_total[k] == 0)
		//	ss->class_total[k] = 1;
    }
}

/*
 * allocate new lda model
 *
 */

lda_model* new_lda_model(int num_terms, int num_topics)
{
    int i,j;
    lda_model* model;

    model = malloc(sizeof(lda_model));
    model->num_topics = num_topics;
    model->num_terms = num_terms;
    model->alpha = 1.0;
    model->log_prob_w = malloc(sizeof(double*)*num_topics);
    for (i = 0; i < num_topics; i++)
    {
	model->log_prob_w[i] = malloc(sizeof(double)*num_terms);
	for (j = 0; j < num_terms; j++)
	    model->log_prob_w[i][j] = 0;
    }
    return(model);
}

/*
 * deallocate new lda model
 *
 */

void free_lda_model(lda_model* model)
{
    int i;

    for (i = 0; i < model->num_topics; i++)
    {
		free(model->log_prob_w[i]);
    }
    free(model->log_prob_w);
	free(model);
}

/*
 * save an lda model
 *
 */

void save_lda_model(lda_model* model, char* model_root)
{
    char filename[100];
    FILE* fileptr;
    int i, j;

    sprintf(filename, "%s.beta", model_root);
    fileptr = fopen(filename, "w");
    for (i = 0; i < model->num_topics; i++)
    {
		for (j = 0; j < model->num_terms; j++)
		{
			fprintf(fileptr, " %5.10f", model->log_prob_w[i][j]);
		}
		fprintf(fileptr, "\n");
    }
    fclose(fileptr);

    sprintf(filename, "%s.other", model_root);
    fileptr = fopen(filename, "w");
    fprintf(fileptr, "num_topics %d\n", model->num_topics);
    fprintf(fileptr, "num_terms %d\n", model->num_terms);
    fprintf(fileptr, "alpha %5.10f\n", model->alpha);
    fclose(fileptr);
}
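For reference, the .other file written above is plain text. With the settings from main.c (k = 100 topics, a 512-word vocabulary) and a made-up alpha value purely for illustration, it would look like:

num_topics 100
num_terms 512
alpha 0.0500000000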

lda_model* load_lda_model(char* model_root)
{
    char filename[100];
    FILE* fileptr;
    int i, j, num_terms, num_topics;
    float x, alpha;
	lda_model* model;

    sprintf(filename, "%s.other", model_root);
    printf("loading %s\n", filename);
    fileptr = fopen(filename, "r");
    fscanf(fileptr, "num_topics %d\n", &num_topics);
    fscanf(fileptr, "num_terms %d\n", &num_terms);
    fscanf(fileptr, "alpha %f\n", &alpha);
    fclose(fileptr);

    model = new_lda_model(num_terms, num_topics);
    model->alpha = alpha;

    sprintf(filename, "%s.beta", model_root);
    printf("loading %s\n", filename);
    fileptr = fopen(filename, "r");
    for (i = 0; i < num_topics; i++)
    {
        for (j = 0; j < num_terms; j++)
        {
            fscanf(fileptr, "%f", &x);
            model->log_prob_w[i][j] = x;
        }
    }
    fclose(fileptr);
    return(model);
}

3.4 lda-estimate.c

Contains the functions related to fitting the model: the EM loop (run_em) and the E-step (doc_e_step).

#include "lda-estimate.h"

/*
 * perform inference on a document and update sufficient statistics
 *
 */
int LAG=5;
double doc_e_step(document* doc, double* gamma, double** phi,
                  lda_model* model, lda_suffstats* ss)
{
    double likelihood;
    int n, k;
	double gamma_sum;
    // posterior inference

    likelihood = lda_inference(doc, model, gamma, phi);

    // update sufficient statistics

	// update the sufficient statistic for alpha
	// alpha_suffstats = sum(digamma(gamma)) - K*digamma(gamma_sum)
    gamma_sum = 0;
    for (k = 0; k < model->num_topics; k++)
    {
        gamma_sum += gamma[k];
        ss->alpha_suffstats += digamma(gamma[k]); // first derivative of the log Gamma function
    }
    ss->alpha_suffstats -= model->num_topics * digamma(gamma_sum);

    for (n = 0; n < doc->length; n++)
    {
        for (k = 0; k < model->num_topics; k++)
        {
			// phi[n][k] is the probability that word n was generated by topic k (already normalized by lda_inference, so no longer in log space)
            ss->class_word[k][doc->words[n]] += doc->counts[n]*phi[n][k];
            ss->class_total[k] += doc->counts[n]*phi[n][k];
        }
    }
	// count this document into the sufficient statistics
    ss->num_docs = ss->num_docs + 1;

    return(likelihood);
}
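Summed over all documents d, the alpha statistic accumulated above is

$$\mathrm{alpha\_suffstats} = \sum_{d}\left[\sum_{k=1}^{K}\Psi(\gamma_{dk}) - K\,\Psi\!\left(\sum_{k=1}^{K}\gamma_{dk}\right)\right]$$

which is exactly the ss term multiplying (alpha - 1) in the objective alhood of lda-alpha.c (section 3.7).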

/*
 * writes the word assignments line for a document to a file
 *
 */

int write_word_assignment(FILE* result, FILE* f, document* doc, double** phi, lda_model* model)
{
	int n;
	// f stores phi; result stores the outcome: [wordID:ID of the topic with the highest probability]
	fprintf(result, "%03d", doc->length);
	for (n = 0; n < doc->length; n++)
	{
		int k;
		for (k=0;k< model->num_topics;k++)
			fprintf(f, "%f\t",phi[n][k]); //一行对应一个word由每个topic生成的概率
		fprintf(f, "\n");

		fprintf(result, " %04d:%02d",
			doc->words[n], argmax(phi[n], model->num_topics)); // argmax returns the index of the largest element of phi[n], i.e., the topic ID
	}
	fprintf(result, "\n");  //一行对应一个文档的 每个word对应的概率最大的topic
	fflush(f);
	fflush(result);
	return 0;

}

/*
 * saves the gamma parameters of the current dataset
 *
 */

void save_gamma(char* filename, double** gamma, int num_docs, int num_topics)
{
    FILE* fileptr;
    int d, k;
    fileptr = fopen(filename, "w");

    for (d = 0; d < num_docs; d++)
    {
	fprintf(fileptr, "%5.10f", gamma[d][0]);
	for (k = 1; k < num_topics; k++)
	{
	    fprintf(fileptr, " %5.10f", gamma[d][k]);
	}
	fprintf(fileptr, "\n");
    }
    fclose(fileptr);
}

/*
 * run_em
 *
 */

long double  run_em(char* start, char* directory, corpus* corpus)
{
	clock_t start_EM, finish_EM;
	double * theta;
	FILE * thetaFile;

    int d, n;
    lda_model *model = NULL;
    double **var_gamma, **phi;
	FILE* likelihood_file;
	int max_length;
	char filename[500];
	char filename1[500];
	int i;
	double likelihood, likelihood_old, converged;
	lda_suffstats* ss;
	FILE* w_asgn_file;
	FILE* result;

    // allocate variational parameters
	        // allocate the variational parameter gamma, dimensions: number of documents * number of topics
    var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
    for (d = 0; d < corpus->num_docs; d++)
		var_gamma[d] = malloc(sizeof(double) * NTOPICS);
	        // allocate the variational parameter phi, dimensions: (max distinct words in any document) * number of topics
    max_length = max_corpus_length(corpus);
    phi = malloc(sizeof(double*)*max_length);
    for (n = 0; n < max_length; n++)
		phi[n] = malloc(sizeof(double) * NTOPICS);

    // initialize model
    ss = NULL;
    if (strcmp(start, "seeded")==0)
    {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        ss = new_lda_suffstats(model);
        corpus_initialize_ss(ss, model, corpus);  // initialize the topic-word distribution statistics
        lda_mle(model, ss, 0);  //compute MLE lda model from sufficient statistics
        model->alpha = INITIAL_ALPHA;
    }
    else if (strcmp(start, "random")==0)
    {
        model = new_lda_model(corpus->num_terms, NTOPICS);
        ss = new_lda_suffstats(model);
        random_initialize_ss(ss, model);
        lda_mle(model, ss, 0);
        model->alpha = INITIAL_ALPHA;
    }
    else
    {
        model = load_lda_model(start);
        ss = new_lda_suffstats(model);
    }

    sprintf(filename,"%s/000",directory);
    save_lda_model(model, filename);

    // run expectation maximization

    i = 0;
    likelihood_old = 0;
	converged = 1;
    sprintf(filename, "%s/likelihood.dat", directory);
    likelihood_file = fopen(filename, "w");

	start_EM = clock();
	// the EM loop keeps running while both 1 and 2 hold:
	// 1.
	// converged < 0 (the new value improved on the old)
	// or converged > EM_CONVERGED (old and new values are not yet close enough)
	// or too few iterations have run so far (i <= 2)
	// 2.
	// the current iteration is within the allowed maximum
	// or no maximum was specified (EM_MAX_ITER == -1)
    while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) &&  ((i <= EM_MAX_ITER) || (EM_MAX_ITER == -1))   )
    {
        i++; printf("**** em iteration %d ****\n", i);
        likelihood = 0;
        zero_initialize_ss(ss, model); // reset all sufficient statistic values to 0

        // e-step
	// with alpha and beta fixed, find the optimal gamma and phi for every document, update the sufficient statistics, and accumulate the likelihood
        for (d = 0; d < corpus->num_docs; d++)
        {
            if ((d % 10) == 0) printf("document %d in EM iteration %d\n", d, i);
            likelihood += doc_e_step(&(corpus->docs[d]),
                                     var_gamma[d],
                                     phi,
                                     model,
                                     ss);
        }

        // m-step

	// update the model parameters alpha and beta from the current sufficient statistics
        lda_mle(model, ss, ESTIMATE_ALPHA);

        // check for convergence

        converged = (likelihood_old - likelihood) / (likelihood_old);
        if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
        likelihood_old = likelihood;

        // output model and likelihood

        fprintf(likelihood_file, "%10.10f\t%5.5e\n", likelihood, converged);
        fflush(likelihood_file);
        if ((i % LAG) == 0)
        {
            sprintf(filename,"%s/%03d",directory, i);
            save_lda_model(model, filename);
            sprintf(filename,"%s/%03d.gamma",directory, i);
            save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
        }
    }
	// EM iterations finished
	finish_EM = clock();
	printf("nTopic = %d, nTerm = %d estimation time: \n", model->num_topics, model->num_terms);
	printf("  EM iteration takes %f seconds(this is %d miniutes)\n", (double)(finish_EM-start_EM)/CLOCKS_PER_SEC, (finish_EM-start_EM)/CLOCKS_PER_SEC/60);

    // output the final model
    sprintf(filename,"%s/final",directory);
    save_lda_model(model, filename);   // saves beta to the final.beta file, plus the .other file
    sprintf(filename,"%s/final.gamma",directory);
    save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);

	// output theta
	theta = (double*)malloc(sizeof(double)*model->num_topics);
	sprintf(filename1, "%s/final.theta", directory);
	thetaFile = fopen(filename1, "w");
	// output the word assignments (for visualization)
	sprintf(filename1, "%s/result-doc-assgn.dat", directory);
	result = fopen(filename1, "w");
    for (d = 0; d < corpus->num_docs; d++)
    {
	sprintf(filename, "%s/result_%d_phi.dat", directory,d);          //调试这一部分有越界的错误,完毕,filename数组空间太小。
	w_asgn_file = fopen(filename, "w");
        printf("final e step document %d\n",d);
        likelihood += lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi);
	write_word_assignment(result, w_asgn_file, &(corpus->docs[d]), phi, model);
	computeTheta(  thetaFile,  &(corpus->docs[d]), phi,  model,  theta);
	fclose(w_asgn_file);
    }
    fclose(result);
	fclose(thetaFile);
    fclose(likelihood_file);
	// free memory
	free(theta);
	for (d = 0; d < corpus->num_docs; d++)
		free(var_gamma[d]);
	free(var_gamma);
	for (n = 0; n < max_length; n++)
		free(phi[n]);
	free(phi);
	free_lda_ss( ss,  model);
	free_lda_model(model);

	return (long double)(finish_EM-start_EM)/CLOCKS_PER_SEC/60;
}

void computeTheta( FILE* thetaFile, document* doc, double** phi, lda_model* model, double * theta)
{
	int n;

	for (n=0; n< model->num_topics; n++)
		theta[n] = 0;
	for (n = 0;  n<doc->length; n++)
	{
		int topicIndex = argmax(phi[n], model->num_topics);
		theta[  topicIndex  ] = theta[  topicIndex  ] + doc->counts[ n ];
	}

	for (n=0; n<model->num_topics; n++)
	{
		theta[n] = theta[n]/doc->total;
		fprintf(thetaFile, "%f\t", theta[n]);
	}
	fprintf(thetaFile, "\n");
	fflush(thetaFile);

}
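In formula form, computeTheta estimates a document's topic proportions by hard-assigning each word to its most likely topic and normalizing by the document length:

$$\theta_k = \frac{1}{\mathrm{total}}\sum_{n:\;\arg\max_i \phi_{ni} = k} c_n$$

Note that this is a hard-assignment estimate; a softer alternative (not what this code does) would be to read the proportions off the variational Dirichlet as \gamma_k / \sum_j \gamma_j.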

/*
 * read settings.
 *
 */

void read_settings(char* filename)
{
    FILE* fileptr;
    char alpha_action[100];
    fileptr = fopen(filename, "r");
    fscanf(fileptr, "var max iter %d\n", &VAR_MAX_ITER);
    fscanf(fileptr, "var convergence %f\n", &VAR_CONVERGED);
    fscanf(fileptr, "em max iter %d\n", &EM_MAX_ITER);
    fscanf(fileptr, "em convergence %f\n", &EM_CONVERGED);
    fscanf(fileptr, "alpha %s", alpha_action);
    if (strcmp(alpha_action, "fixed")==0)
    {
	ESTIMATE_ALPHA = 0;
    }
    else
    {
	ESTIMATE_ALPHA = 1;
    }
    fclose(fileptr);
}
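Judging from the fscanf format strings above, a settings file should look like the following (the numbers here are only an illustration, not necessarily the values shipped with the code):

var max iter 20
var convergence 1e-6
em max iter 100
em convergence 1e-4
alpha estimate

The last line is either "alpha estimate" (fit alpha in the M-step) or "alpha fixed" (keep INITIAL_ALPHA unchanged); any word other than "fixed" turns estimation on.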

/*
 * inference only
 *
 */

void infer(char* model_root, char* save, corpus* corpus)
{
    FILE* fileptr;
	FILE* result;
	FILE* w_asgn_file;
    char filename[100];
	char filename1[200];
    int i, d, n;
    lda_model *model;
    double **var_gamma, likelihood, **phi;
    document* doc;

	/*double ***corpusPhi;
	corpusPhi = (double***)malloc(sizeof(double**)*(corpus->num_docs));
	for (i=0;i<corpus.num_docs;j++)
	{
	corpusPhi[i] = (double**)malloc(sizeof(double*)*)
	}*/
	sprintf(filename1, "%s/result-doc-assgn.dat", save);
	result = fopen(filename1, "w");

    model = load_lda_model(model_root);
    var_gamma = (double**)malloc(sizeof(double*)*(corpus->num_docs));
    for (i = 0; i < corpus->num_docs; i++)
		var_gamma[i] = (double*)malloc(sizeof(double)*model->num_topics);

	//int max_length = max_corpus_length(corpus);  

    sprintf(filename, "%s-lda-lhood.dat", save);
    fileptr = fopen(filename, "w");

    for (d = 0; d < corpus->num_docs; d++)
    {
		if (((d % 100) == 0) && (d>0)) printf("document %d\n",d);

		doc = &(corpus->docs[d]);
		phi = (double**) malloc(sizeof(double*) * doc->length);
		//phi = (double**) malloc(sizeof(double*) * max_length);
		for (n = 0; n < doc->length; n++)
		//for (n = 0; n < max_length; n++)
			phi[n] = (double*) malloc(sizeof(double) * model->num_topics);
		likelihood = lda_inference(doc, model, var_gamma[d], phi);

		fprintf(fileptr, "%5.5f\n", likelihood);

		// write each document's phi to result_%d_phi.dat; additionally, each word's most likely topic is saved to result-doc-assgn.dat, one line per document
		sprintf(filename, "%s/result_%d_phi.dat", save,d);
		w_asgn_file = fopen(filename, "w");
		printf("final e step document %d\n",d);
		write_word_assignment(result,w_asgn_file, &(corpus->docs[d]), phi, model);
		fclose(w_asgn_file);
		for (n = 0; n < doc->length; n++)
			free(phi[n]);
		free(phi);
    }
    fclose(fileptr);
    sprintf(filename, "%s-gamma.dat", save);
    save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);

	fclose(result);
	for (d = 0; d < corpus->num_docs; d++)
		free(var_gamma[d]);
	free(var_gamma); 

	free_lda_model(model);
}

3.5 lda-inference.c

Contains the functions that solve for the variational parameters.

#include "lda-inference.h"

/*
 * variational inference
 *
 */
int lisnan(double x) {
	return x != x;
}
double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi)
{
    double converged = 1;
    double phisum = 0, likelihood = 0;
    double likelihood_old = 0,  *oldphi=(double *)malloc(sizeof(double)*(model->num_topics));

    int k, n, var_iter;
   double *digamma_gam=(double *)malloc(sizeof(double)*(model->num_topics));

    // compute posterior dirichlet

    for (k = 0; k < model->num_topics; k++)
    {
	// initialize the variational parameter gamma = alpha + N/k, where N is the document's total word count (repeats included) and k the number of topics
        var_gamma[k] = model->alpha + (doc->total/((double) model->num_topics));
        // first derivative of the log Gamma function
	digamma_gam[k] = digamma(var_gamma[k]);
        // initialize the variational parameter phi = 1/k
	for (n = 0; n < doc->length; n++)
            phi[n][k] = 1.0/model->num_topics;
    }
    var_iter = 0;
	// start iterating
    while ((converged > VAR_CONVERGED) &&
           ((var_iter < VAR_MAX_ITER) || (VAR_MAX_ITER == -1)))
    {
	var_iter++;
	for (n = 0; n < doc->length; n++)
	{
            phisum = 0;
            for (k = 0; k < model->num_topics; k++)
            {
                oldphi[k] = phi[n][k];
		// update the variational parameter phi
		// this is the update from the paper's variational inference algorithm: phi(n,i) = b(i,wn) * exp(digamma(gamma(i)))
		// because of the exp it is computed in log space, so phi is in log space at this point
                phi[n][k] =
                    digamma_gam[k] +
                    model->log_prob_w[k][doc->words[n]];

                if (k > 0)
                    phisum = log_sum(phisum, phi[n][k]); // sum phi in log space
                else
                    phisum = phi[n][k]; // note, phi is in log space
            }

            for (k = 0; k < model->num_topics; k++)
            {
		// normalize so that phi[n] sums to 1 (the exp also brings phi back out of log space)
                phi[n][k] = exp(phi[n][k] - phisum);
		// update the variational parameter gamma
                var_gamma[k] =
                    var_gamma[k] + doc->counts[n]*(phi[n][k] - oldphi[k]);
                // !!! a lot of extra digamma's here because of how we're computing it
                // !!! but its more automatically updated too.
                digamma_gam[k] = digamma(var_gamma[k]);
            }
        }

        likelihood = compute_likelihood(doc, model, phi, var_gamma);
        assert(!isnan(likelihood));
        converged = (likelihood_old - likelihood) / likelihood_old;
        likelihood_old = likelihood;

        // printf("[LDA INF] %8.5f %1.3e\n", likelihood, converged);
    } // iteration finished
    free(oldphi);      // oldphi and digamma_gam are heap-allocated in this VS port, so free them to avoid a memory leak
    free(digamma_gam);
    return(likelihood);
}
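The two updates inside the loop are, in the paper's notation (w_n the n-th distinct word of the document, c_n its count):

$$\phi_{nk} \propto \beta_{k,w_n}\exp\big(\Psi(\gamma_k)\big), \qquad \gamma_k = \alpha + \sum_{n} c_n\,\phi_{nk}$$

The code applies the gamma update incrementally, word by word, via var_gamma[k] += counts[n]*(phi[n][k] - oldphi[k]). Over a full pass this is equivalent to recomputing gamma from its definition (the initialization gamma = alpha + total/k corresponds to phi = 1/k), but it keeps digamma_gam in sync after every word, which is what the "a lot of extra digamma's" remark in the code refers to.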

/*
 * compute likelihood bound
 *
 */
// compute L(gamma,phi;alpha,beta) following Eq. (15) in the paper's appendix
double
compute_likelihood(document* doc, lda_model* model, double** phi, double* var_gamma)
{
    double likelihood = 0, digsum = 0, var_gamma_sum = 0, *dig=(double *)malloc(sizeof(double)*(model->num_topics));
    int k, n;

    for (k = 0; k < model->num_topics; k++)
    {
	dig[k] = digamma(var_gamma[k]);
	var_gamma_sum += var_gamma[k];
    }
    digsum = digamma(var_gamma_sum);
	// the five expectations in Eq. (14) of the paper: terms 1 and 4 are computed jointly and then split up; terms 2, 3, and 5 are computed together
	// the first two parts of Eq[log p(theta|alpha)] and the first part of Eq[log q(theta)]
    likelihood =
	log_gamma(model->alpha * model -> num_topics)
	- model -> num_topics * log_gamma(model->alpha)
	- (log_gamma(var_gamma_sum));

    for (k = 0; k < model->num_topics; k++)
    {
    // the third part of Eq[log p(theta|alpha)] and the remainder of Eq[log q(theta)]
	likelihood +=
	    (model->alpha - 1)*(dig[k] - digsum) + log_gamma(var_gamma[k])
	    - (var_gamma[k] - 1)*(dig[k] - digsum);
	//Eq[logp(z|theta)] + Eq[logp(w|z,beta)] - Eq[logq(z)]
	for (n = 0; n < doc->length; n++)
	{
            if (phi[n][k] > 0)
            {
                likelihood += doc->counts[n]*
                    (phi[n][k]*((dig[k] - digsum) - log(phi[n][k])
                                + model->log_prob_w[k][doc->words[n]]));
            }
        }
    }
    free(dig); // heap-allocated in this VS port; free to avoid a memory leak
    return(likelihood);
}
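For reference, the bound computed above is, with \Psi the digamma function and \gamma_{\mathrm{sum}} = \sum_k \gamma_k:

$$\begin{aligned} L &= \log\Gamma(K\alpha) - K\log\Gamma(\alpha) - \log\Gamma(\gamma_{\mathrm{sum}}) \\ &\quad + \sum_{k}\Big[(\alpha-1)\big(\Psi(\gamma_k)-\Psi(\gamma_{\mathrm{sum}})\big) + \log\Gamma(\gamma_k) - (\gamma_k-1)\big(\Psi(\gamma_k)-\Psi(\gamma_{\mathrm{sum}})\big)\Big] \\ &\quad + \sum_{n}\sum_{k} c_n\,\phi_{nk}\Big[\Psi(\gamma_k)-\Psi(\gamma_{\mathrm{sum}}) - \log\phi_{nk} + \log\beta_{k,w_n}\Big] \end{aligned}$$

which matches the code term by term.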

3.6 lda-data.c

Reads in the dataset.

#include "lda-data.h"

corpus* read_data(char* data_filename)
{
    FILE *fileptr;
    int length, count, word, n, nd, nw;
    corpus* c;

    printf("reading data from %s\n", data_filename);
    c = malloc(sizeof(corpus));
    c->docs = 0;
    c->num_terms = 0;
    c->num_docs = 0;
    fileptr = fopen(data_filename, "r");
    nd = 0; nw = 0;
    while ((fscanf_s(fileptr, "%10d", &length) != EOF)) // read the first number on each line: the document's number of distinct words
    {
	// for the nd-th document
	c->docs = (document*) realloc(c->docs, sizeof(document)*(nd+1)); // grow the docs array by one document; realloc(ptr, new_size) preserves the existing contents (a smaller new size would lose data)
	c->docs[nd].length = length; // number of distinct words occurring in the document
	c->docs[nd].total = 0; // total word count, repeats included: the sum of counts
	c->docs[nd].words = malloc(sizeof(int)*length); // the words' IDs in the corpus vocabulary
	c->docs[nd].counts = malloc(sizeof(int)*length); // each word's occurrence count
	for (n = 0; n < length; n++) // read the rest of the line: the word counts
	{
	    fscanf_s(fileptr, "%10d:%10d", &word, &count); //读入每个 [wordID:word出现次数]
	    word = word - OFFSET;
	    c->docs[nd].words[n] = word;
	    c->docs[nd].counts[n] = count;
	    c->docs[nd].total += count;
	    if (word >= nw) { nw = word + 1; } // nw tracks the largest word ID seen plus one, i.e., the corpus vocabulary size
		//if (word >= nw) { nw = word; }

	}
		nd++;
    }
    fclose(fileptr);
    c->num_docs = nd;
    c->num_terms = nw;
    printf("number of docs    : %d\n", nd);
    printf("number of terms   : %d\n", nw);
    return(c);
}
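As a concrete example (made-up data in the format this parser expects, and assuming OFFSET is 0 as in the original code), a two-document input file could contain:

3 0:2 5:1 9:4
2 1:1 9:2

read_data would then report num_docs = 2 and num_terms = 10 (the largest word ID is 9, with IDs starting at 0), and the first document would end up with length = 3, total = 7, words = {0, 5, 9}, counts = {2, 1, 4}.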

int max_corpus_length(corpus* c) // returns the number of distinct words in the longest document; length counts distinct words only
{
    int n, max = 0;
    for (n = 0; n < c->num_docs; n++)
	if (c->docs[n].length > max) max = c->docs[n].length;
    return(max);
}

3.7 lda-alpha.c

Computes the model parameter alpha with Newton's method.

#include "lda-alpha.h"
#include "lda-inference.h"
/*
 * objective function and its derivatives
 *
 */

double alhood(double a, double ss, int D, int K)
{ return(D * (log_gamma(K * a) - K * log_gamma(a)) + (a - 1) * ss); }

double d_alhood(double a, double ss, int D, int K)
{ return(D * (K * digamma(K * a) - K * digamma(a)) + ss); }

double d2_alhood(double a, int D, int K)
{ return(D * (K * K * trigamma(K * a) - K * trigamma(a))); }

/*
 * newtons method
 *
 */

double opt_alpha(double ss, int D, int K)
{
    double a, log_a, init_a = 100;
    double f, df, d2f;
    int iter = 0;

    log_a = log(init_a);
    do
    {
        iter++;
        a = exp(log_a);
        if (isnan(a))
        {
            init_a = init_a * 10;
            printf("warning : alpha is nan; new init = %5.5f\n", init_a);
            a = init_a;
            log_a = log(a);
        }
        f = alhood(a, ss, D, K); // L(a) from Appendix A.4.2
        df = d_alhood(a, ss, D, K); // first derivative of L with respect to a
        d2f = d2_alhood(a, D, K); // second derivative
        log_a = log_a - df/(d2f * a + df); // the update rule (Newton's method in log space; see below)
        printf("alpha maximization : %5.5f   %5.5f\n", f, df);
    }
    while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER));
    return(exp(log_a));
}
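The update rule in opt_alpha is Newton's method applied to log a rather than a itself, which keeps alpha positive throughout. With a = e^{log a}, the chain rule gives

$$\frac{dL}{d\log a} = a\,L'(a), \qquad \frac{d^2L}{d(\log a)^2} = a^2\,L''(a) + a\,L'(a)$$

so the Newton step becomes

$$\log a \;\leftarrow\; \log a - \frac{a\,L'(a)}{a^2\,L''(a) + a\,L'(a)} = \log a - \frac{L'(a)}{a\,L''(a) + L'(a)}$$

which is exactly log_a = log_a - df/(d2f * a + df) in the code.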

The remaining files, cokus.c and utils.c, contain assorted math utility functions.

Copyright notice: this is an original post by the author; please do not repost without permission.
