nlp领域里,语义理解仍然是难题!
给你一篇文章或者一个句子,人们在理解这些句子时,头脑中会进行上下文的搜索和知识联想。通常情况下,人在理解语义时头脑中会搜寻与之相关的知识。知识图谱的创始人认为,构成这个世界的是实体,而不是字符串,这从根本上改变了过去搜索的体系。语义理解其实是基于知识、关联和概念。人们在解答问题时,往往会旁征博引或者讲述语义相似的人们容易理解的知识,这是语义理解的过程。这种机制完全不同于人对图像或者语音的认识。CNN在图像或者语音领域取得成果是不足为奇的,因为生物学家目前已经对人脑对于图像的识别过程的神经元机制非常熟悉,但是对于人脑如何理解文字的神经元机制却知之甚少,所以导致了目前nlp语义理解方面进展非常缓慢。很多人尝试把CNN引入nlp效果不佳,发现多层的CNN和单层的CNN几乎没有差别,原因得从人脑的神经元机制说起。生搬硬套是必然失败的!深度学习的本质并不是神经元层数多这么简单,能够从最基本的特征,逐层抽取出高阶特征,最后进行分类,这是深度学习取得成功的关键。
有一部分人质疑word2vector不是深度学习,说层数太浅达不到深度的级别,这是一种误解。word2vector是地地道道的深度学习,能够抽取出词的高阶特征。它的成功,关键是基于它的核心思想:相同语境出现的词语义相近。从第一层one-hot到embedding层,就是高阶特征抽取的过程。前面说过,层数多了并不会带来效果的提升,原因是人脑对nlp的神经元机制还不清楚,不能照搬CNN。词embedding已经是高阶特征了,文字比图像要复杂很多,即使能够层层抽取,目前深度学习在nlp中的引入,方向可能是错误的。必须深入研究人脑对文字理解的神经元机制,弄清楚生物学模型,然后才能从中抽象出数学模型,就像CNN一样,否则nlp不会有长足的进展。
目前来看,深度学习在nlp中的应用,仅限于上下文和词、句子向量,计算一下句子相似度、聚类之类的;要想真正让机器理解文字,还达不到。Google提出的知识图谱是一种变革,nlp是一个完整的生态圈,从最底层的存储,GDB三元组(entity,relation,entity)存储,到上层的语义表示(这个阶段可以借助深度学习),目前知网也在做这些事情。语义表示是深度学习在nlp应用中的重中之重。本文主要探讨word2vector。关于它的核心思想前面已经提到了,这是道的层面,具体推导,比如CBOW、skip-gram的优化:negative sampling和基于哈夫曼树的层次softmax(hierarchical softmax),这是术的层面。现在上传用tensorflow实现的word2vector代码:
data_helper.py:
"""Data helpers for the word2vec example.

Covers downloading the corpus archive, reading it into a word list, mapping
words to integer ids, and cutting skip-gram training batches out of the id
sequence.
"""
import collections
import os
import random
import urllib.request as request
import zipfile

import numpy as np

url = 'http://mattmahoney.net/dc/'


def maybe_download(filename, expected_bytes):
    """Fetch ``filename`` from ``url`` unless it already exists, then check its size.

    Args:
        filename: Name of the file, both relative to ``url`` and on local disk.
        expected_bytes: Size in bytes the file must have to be accepted.

    Returns:
        The local file name.

    Raises:
        Exception: If the size on disk differs from ``expected_bytes``.
    """
    if not os.path.exists(filename):
        filename, _ = request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename


def read_data(filename):
    """Return the first member of the zip archive as a list of whitespace-split words."""
    # TensorFlow is only needed for this one conversion, so it is imported
    # here; the indexing and batching helpers stay importable without it.
    import tensorflow as tf

    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data


vocabulary_size = 50000


def build_dataset(words, vocab_size=None):
    """Map words to integer ids, most frequent first, with id 0 reserved for 'UNK'.

    Args:
        words: Sequence of word strings.
        vocab_size: Number of ids to allocate, including the 'UNK' slot.
            Defaults to the module-level ``vocabulary_size``.

    Returns:
        A tuple ``(data, reverse_dictionary, dictionary, count)``:
        ``data`` is ``words`` converted to ids (0 for out-of-vocabulary words),
        ``reverse_dictionary`` maps id -> word, ``dictionary`` maps word -> id,
        and ``count`` lists ``[word, frequency]`` entries whose first entry is
        ``['UNK', <number of out-of-vocabulary occurrences>]``.
    """
    if vocab_size is None:
        vocab_size = vocabulary_size

    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocab_size - 1))
    # Position in ``count`` is the word id, so more frequent words get lower ids.
    dictionary = {word: index for index, (word, _) in enumerate(count)}

    data = []
    un_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0
            un_count += 1
        data.append(index)
    count[0][1] = un_count

    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, reverse_dictionary, dictionary, count


data_index = 0


def generate_batch(data, batch_size, num_skips, skip_window):
    """Cut one skip-gram batch out of ``data``, continuing from the previous call.

    The read position is kept in the module-level ``data_index`` and wraps
    around at the end of ``data``.

    Args:
        data: Sequence of word ids.
        batch_size: Number of (centre, context) pairs to produce; must be a
            multiple of ``num_skips``.
        num_skips: Number of context words drawn for each centre word; at most
            ``2 * skip_window``.
        skip_window: Number of words considered on each side of the centre.

    Returns:
        ``(batch, labels)``: int32 arrays of shape ``[batch_size]`` (centre
        word ids) and ``[batch_size, 1]`` (context word ids).
    """
    global data_index
    assert num_skips <= 2 * skip_window
    assert batch_size % num_skips == 0

    span = 2 * skip_window + 1
    batch = np.ndarray(shape=[batch_size], dtype=np.int32)
    labels = np.ndarray(shape=[batch_size, 1], dtype=np.int32)
    buffer = collections.deque(maxlen=span)

    # Fill the window with the first ``span`` ids.
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    # Slide the window one word at a time, emitting ``num_skips`` pairs per position.
    for i in range(batch_size // num_skips):
        target = skip_window
        avoid_target = [skip_window]
        for j in range(num_skips):
            while target in avoid_target:
                # The upper bound of np.random.randint is exclusive, so ``span``
                # is required for the last window position to be reachable;
                # with ``span - 1`` this loop cannot terminate when
                # num_skips == 2 * skip_window.
                target = np.random.randint(0, span)
            avoid_target.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
w2vmodel.py:
"""Skip-gram word2vec graph definition and its hyper-parameters."""
import math

import numpy as np
import tensorflow as tf

import w2v.data_helper as da

# The corpus archive is read from the working directory at import time. If it
# is not there yet, fetch it first with da.maybe_download('text8.zip', 31344016).
words = da.read_data("text8.zip")
assert words is not None
data, reverse_dictionary, dictionary, count = da.build_dataset(words)


class config(object):
    """Hyper-parameters shared by the model graph and the training loop."""

    batch_size = 128        # (centre, context) pairs per step; fixes the placeholder shapes
    embedding_size = 128    # width of each embedding vector
    skip_window = 1         # words considered on each side of the centre word
    num_skips = 2           # context words drawn per centre word
    valid_size = 16         # number of word ids sampled for the similarity read-out
    valid_window = 100      # validation ids are drawn from range(valid_window)
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)
    num_sampled = 64        # negative classes sampled by the NCE loss
    # Taken from the data helper so the embedding table always matches the
    # id range produced by build_dataset.
    vocabulary_size = da.vocabulary_size
    num_steps = 10001       # training iterations


class w2vModel(object):
    """Builds the skip-gram graph in the current default TensorFlow graph.

    Attributes set by ``__init__`` (all are graph tensors/ops unless noted):
        train_inputs: int32 placeholder of shape ``[batch_size]``.
        train_labels: int32 placeholder of shape ``[batch_size, 1]``.
        valid_dataset: constant holding ``config.valid_examples``.
        embeddings: the embedding variable, ``[vocabulary_size, embedding_size]``.
        loss: mean NCE loss over the batch.
        optimizer: gradient-descent training op minimising ``loss``.
        normalized_embeddings: ``embeddings`` with every row scaled to unit L2 norm.
        similarity: product of the normalised validation rows with all
            normalised embeddings, shape ``[valid_size, vocabulary_size]``.
        saver: ``tf.train.Saver`` over all global variables.
    """

    def __init__(self, config):
        self.train_inputs = train_inputs = tf.placeholder(tf.int32, shape=[config.batch_size])
        self.train_labels = train_labels = tf.placeholder(tf.int32, shape=[config.batch_size, 1])
        self.valid_dataset = valid_dataset = tf.constant(config.valid_examples, dtype=tf.int32)

        with tf.device('/cpu:0'):
            self.embeddings = embeddings = tf.Variable(
                tf.random_uniform(shape=[config.vocabulary_size, config.embedding_size],
                                  minval=-1.0, maxval=1.0))
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)
            nce_weights = tf.Variable(
                tf.truncated_normal([config.vocabulary_size, config.embedding_size],
                                    stddev=1.0 / math.sqrt(config.embedding_size)))
            nce_bias = tf.Variable(tf.zeros([config.vocabulary_size]))

        # The training script reads loss/optimizer/similarity from the model
        # instance, so they must be stored as attributes rather than locals.
        self.loss = loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights,
                           biases=nce_bias,
                           labels=train_labels,
                           inputs=embed,
                           num_sampled=config.num_sampled,
                           num_classes=config.vocabulary_size))
        self.optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

        # NOTE(review): ``keep_dims`` is the older TF 1.x argument name; later
        # releases spell it ``keepdims`` - adjust if the installed version
        # rejects it.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        self.normalized_embeddings = normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
        self.similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

        tf.add_to_collection("embedding", embeddings)
        self.saver = tf.train.Saver(tf.global_variables())
train.py:
"""Training script: runs the skip-gram model and prints nearest neighbours."""
import tensorflow as tf

import w2v.w2vmodel as model
import w2v.data_helper as da

config = model.config()

with tf.Graph().as_default() as g:
    Model = model.w2vModel(config)

    with tf.Session(graph=g) as session:
        tf.global_variables_initializer().run()
        print("initialized")

        average_loss = 0.0
        for step in range(config.num_steps):
            inputs, labels = da.generate_batch(
                model.data, config.batch_size, config.num_skips, config.skip_window)
            feed = {Model.train_inputs: inputs, Model.train_labels: labels}
            _, step_loss = session.run([Model.optimizer, Model.loss], feed_dict=feed)
            average_loss += step_loss

            # Report the running mean every 2000 steps, then start a new window.
            # At step 0 the accumulator holds a single loss value and is
            # printed undivided.
            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                print("Average loss at step", step, ":", average_loss)
                average_loss = 0

            # Every 10000 steps, list the closest words for each validation word.
            if step % 10000 == 0:
                sim_matrix = Model.similarity.eval()
                top_k = 8
                for row in range(config.valid_size):
                    valid_word = model.reverse_dictionary[config.valid_examples[row]]
                    # Rank by descending similarity; position 0 is skipped
                    # (it is expected to be the word itself).
                    neighbours = (-sim_matrix[row, :]).argsort()[1:top_k + 1]
                    header = "Nearest to %s:" % valid_word
                    tail = "".join(
                        " %s," % model.reverse_dictionary[neighbours[k]]
                        for k in range(top_k))
                    print(header + tail)

        # Save the variables once training has finished.
        Model.saver.save(session, "E:/word2vector/models/model.ckpt")
        # The trained, normalised vectors could be evaluated here as well
        # (e.g. the model's normalized embeddings tensor via .eval()).
代码实现比较简单,先对样本统计,然后降序排列,再得到dictionary{词:索引},接下来把样本中的词转换成索引,进行训练。词向量就是神经元参数embedding,在预测时,只需要拿出embedding和dictionary,就可以得到词向量,比biLSTM和siamese lstm简单多了!但是,它在语义理解上有致命的缺点:对于词典中没出现的词,统一用索引0(UNK)代替,所有未登录词共用同一个语义表示,明显是不妥当的,能力有限!所以现在国内有少数的学者研究把神经概率语义表示和符号语义表示结合起来,难度不小!
期待nlp语义理解出现变革……