TensorFlow: converting libsvm-format data to TFRecord (parse libsvm data to TFRecord)

# Write libsvm-format data as TFRecord

#!/usr/bin/env python
#coding=gbk
# ==============================================================================
# \file gen-records.py
# \author chenghuige
# \date 2016-08-12 11:52:01.952044
# \Description
# ==============================================================================

from __future__ import absolute_import
from __future__ import division
#from __future__ import print_function

import sys, os

import tensorflow as tf
import numpy as np

flags = tf.app.flags
FLAGS = flags.FLAGS

_float_feature = lambda v: tf.train.Feature(float_list=tf.train.FloatList(value=v))
_int_feature = lambda v: tf.train.Feature(int64_list=tf.train.Int64List(value=v))

#how to store global info, using sequence example?
def main(argv):
  writer = tf.python_io.TFRecordWriter(argv[2])
  for line in open(argv[1]):
    l = line.rstrip().split()
    label = int(l[0])

    start = 1
    num_features = 0
    if ':' not in l[1]:
      num_features = int(l[1])
      start += 1

    indexes = []
    values = []

    for item in l[start:]:
      index, value = item.split(':')
      indexes.append(int(index))
      values.append(float(value))

    example = tf.train.Example(features=tf.train.Features(feature={
        'label': _int_feature([label]),
        'num_features': _int_feature([num_features]),
        'index': _int_feature(indexes),
        'value': _float_feature(values)
        }))
    writer.write(example.SerializeToString())

if __name__ == '__main__':
  tf.app.run()
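For reference, a hedged usage sketch (file names and feature ids below are invented, not from the original post): each libsvm line is `<label> [<total_num_features>] <index>:<value> ...`, so a toy input can be generated and converted like this:

# Write a toy libsvm file; labels, indexes and values here are made up.
with open('toy.libsvm', 'w') as f:
    f.write('1 3:1.0 17:0.5 42:2.0\n')   # label 1, three non-zero features
    f.write('0 5:1.0 17:1.5\n')          # label 0, two non-zero features

# Then convert it with the script above (argv[1] = input, argv[2] = output):
#   python gen-records.py toy.libsvm toy.tfrecord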

# Read libsvm-format data from TFRecord

#!/usr/bin/env python
#coding=gbk
# ==============================================================================
# \file read-records.py
# \author chenghuige
# \date 2016-07-19 17:09:07.466651
# \Description
# ==============================================================================

#@TODO treat comment as sparse input ?

from __future__ import absolute_import
from __future__ import division
#from __future__ import print_function

import sys, os, time
import tensorflow as tf
import numpy as np

flags = tf.app.flags
FLAGS = flags.FLAGS

flags.DEFINE_integer('batch_size', 5, 'Batch size.')
flags.DEFINE_integer('num_epochs', 10, 'Number of epochs to run trainer.')
flags.DEFINE_integer('num_preprocess_threads', 12, '')

MIN_AFTER_DEQUEUE = 10000

def read(filename_queue):
  reader = tf.TFRecordReader()
  _, serialized_example = reader.read(filename_queue)
  return serialized_example

def decode(batch_serialized_examples):
  features = tf.parse_example(
      batch_serialized_examples,
      features={
          'label' : tf.FixedLenFeature([], tf.int64),
          'index' : tf.VarLenFeature(tf.int64),
          'value' : tf.VarLenFeature(tf.float32),
      })

  label = features['label']
  index = features['index']
  value = features['value']

  return label, index, value

def batch_inputs(files, batch_size, num_epochs=None, num_preprocess_threads=1):
  """Reads input data num_epochs times."""
  if not num_epochs: num_epochs = None

  with tf.name_scope('input'):
    filename_queue = tf.train.string_input_producer(
        files, num_epochs=num_epochs)

    serialized_example = read(filename_queue)
    batch_serialized_examples = tf.train.shuffle_batch(
        [serialized_example],
        batch_size=batch_size,
        num_threads=num_preprocess_threads,
        capacity=MIN_AFTER_DEQUEUE + (num_preprocess_threads + 1) * batch_size,
        # Ensures a minimum amount of shuffling of examples.
        min_after_dequeue=MIN_AFTER_DEQUEUE)

    return decode(batch_serialized_examples)

def read_records():
  # Tell TensorFlow that the model will be built into the default Graph.
  with tf.Graph().as_default():
    # Input labels and sparse features.
    tf_record_pattern = sys.argv[1]
    data_files = tf.gfile.Glob(tf_record_pattern)
    label, index, value = batch_inputs(data_files,
                                       batch_size=FLAGS.batch_size,
                                       num_epochs=FLAGS.num_epochs,
                                       num_preprocess_threads=FLAGS.num_preprocess_threads)

    # The op for initializing the variables.
    init_op = tf.group(tf.initialize_all_variables(),
                       tf.initialize_local_variables())

    # Create a session for running operations in the Graph.
    #sess = tf.Session()
    sess = tf.InteractiveSession()

    # Initialize the variables (the trained variables and the epoch counter).
    sess.run(init_op)

    # Start input enqueue threads.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    try:
      step = 0
      while not coord.should_stop():
        start_time = time.time()
        label_, index_, value_ = sess.run([label, index, value])
        print label_
        print index_
        print value_
        print index_[0]
        print index_[1]
        print index_[2]
        duration = time.time() - start_time
        step += 1
    except tf.errors.OutOfRangeError:
      print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
    finally:
      # When done, ask the threads to stop.
      coord.request_stop()

    # Wait for threads to finish.
    coord.join(threads)
    sess.close()

def main(_):
  read_records()

if __name__ == '__main__':
  tf.app.run()
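As a hedged sanity check (toy feature ids and values, not part of the original post), the write and read paths can be exercised on a single example: serialize one tf.train.Example the way gen-records.py does, then parse it back with the same feature spec that decode() uses. The VarLenFeature entries come back as SparseTensorValue tuples (indices, values, dense_shape), which is exactly what the print statements above show per batch.

import tensorflow as tf

# Toy example: label 1 with three non-zero features (ids and values invented).
example = tf.train.Example(features=tf.train.Features(feature={
    'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[1])),
    'index': tf.train.Feature(int64_list=tf.train.Int64List(value=[3, 17, 42])),
    'value': tf.train.Feature(float_list=tf.train.FloatList(value=[1.0, 0.5, 2.0])),
}))

parsed = tf.parse_example(
    tf.constant([example.SerializeToString()]),
    features={
        'label': tf.FixedLenFeature([], tf.int64),
        'index': tf.VarLenFeature(tf.int64),
        'value': tf.VarLenFeature(tf.float32),
    })

with tf.Session() as sess:
    print sess.run(parsed['label'])   # dense int64 array, shape [1]
    print sess.run(parsed['index'])   # SparseTensorValue(indices, values, dense_shape)
    print sess.run(parsed['value'])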

# Text classification

https://github.com/chenghuige/tensorflow-example

Using TFRecord only requires a small modification, as shown below; I will update the code on GitHub soon.

class SparseClassificationTrainer(object):
  """General framework for a sparse BinaryClassificationTrainer.

  Sparse binary classification uses the sparse embedding lookup trick,
  see https://github.com/tensorflow/tensorflow/issues/342
  """
  def __init__(self, dataset=None, num_features=0):
    if dataset is not None and type(dataset) != TfDataSet:
      self.labels = dataset.labels
      self.features = dataset.features
      self.num_features = dataset.num_features
      self.num_classes = dataset.num_classes
    else:
      self.features = SparseFeatures()
      self.num_features = num_features
      self.num_classes = None

    self.index_only = False
    self.total_features = self.num_features

    if type(dataset) != TfDataSet:
      self.sp_indices = tf.placeholder(tf.int64, name='sp_indices')
      self.sp_shape = tf.placeholder(tf.int64, name='sp_shape')
      self.sp_ids_val = tf.placeholder(tf.int64, name='sp_ids_val')
      self.sp_weights_val = tf.placeholder(tf.float32, name='sp_weights_val')
      self.sp_ids = tf.SparseTensor(self.sp_indices, self.sp_ids_val, self.sp_shape)
      self.sp_weights = tf.SparseTensor(self.sp_indices, self.sp_weights_val, self.sp_shape)

      self.X = (self.sp_ids, self.sp_weights)
      self.Y = tf.placeholder(tf.int32)  #same as batch size
    else:
      self.X = (dataset.index, dataset.value)
      self.Y = dataset.label

    self.type = 'sparse'
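The "sparse embedding lookup trick" referenced in the docstring amounts to feeding the (sp_ids, sp_weights) pair into tf.nn.embedding_lookup_sparse, so the model never materializes a dense [batch_size, num_features] input. A minimal sketch, with variable names and sizes that are my assumptions rather than the repo's code:

import tensorflow as tf

# Sketch of the sparse lookup trick; W, b and the sizes below are invented.
num_features, num_classes = 1000000, 2
W = tf.Variable(tf.zeros([num_features, num_classes]))
b = tf.Variable(tf.zeros([num_classes]))

def forward(sp_ids, sp_weights):
  # For each row, sums W[id] * weight over that row's non-zero features,
  # i.e. a linear layer computed directly from the libsvm indexes/values.
  return tf.nn.embedding_lookup_sparse(W, sp_ids, sp_weights, combiner='sum') + b

# e.g. logits = forward(*self.X) inside the trainer above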

MIN_AFTER_DEQUEUE = 10000

def read(filename_queue):
  reader = tf.TFRecordReader()
  _, serialized_example = reader.read(filename_queue)
  return serialized_example

def decode(batch_serialized_examples):
  features = tf.parse_example(
      batch_serialized_examples,
      features={
          'label' : tf.FixedLenFeature([], tf.int64),
          'index' : tf.VarLenFeature(tf.int64),
          'value' : tf.VarLenFeature(tf.float32),
      })

  label = features['label']
  index = features['index']
  value = features['value']

  return label, index, value

def batch_inputs(files, batch_size, num_epochs=None, num_preprocess_threads=12):
  if not num_epochs: num_epochs = None

  with tf.name_scope('input'):
    filename_queue = tf.train.string_input_producer(
        files, num_epochs=num_epochs)

    serialized_example = read(filename_queue)
    batch_serialized_examples = tf.train.shuffle_batch(
        [serialized_example],
        batch_size=batch_size,
        num_threads=num_preprocess_threads,
        capacity=MIN_AFTER_DEQUEUE + (num_preprocess_threads + 1) * batch_size,
        # Ensures a minimum amount of shuffling of examples.
        min_after_dequeue=MIN_AFTER_DEQUEUE)

    return decode(batch_serialized_examples)

class TfDataSet(object):
  def __init__(self, data_files):
    self.data_files = data_files
    #@TODO now only deal sparse input
    self.features = SparseFeatures()
    self.label = None

  def build_read_graph(self, batch_size):
    tf_record_pattern = self.data_files
    data_files = tf.gfile.Glob(tf_record_pattern)
    self.label, self.index, self.value = batch_inputs(data_files, batch_size)

  def next_batch(self, sess):
    label, index, value = sess.run([self.label, self.index, self.value])

    trX = (index, value)
    trY = label

    return trX, trY
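A hedged usage sketch of TfDataSet (the file pattern and batch size are invented, and it assumes SparseFeatures and the queue-runner/session setup from read-records.py are in scope):

# Build the read graph once, then either wire dataset.label/index/value
# straight into the model (the TfDataSet branch of the trainer above),
# or pull numpy batches for a feed_dict style loop.
dataset = TfDataSet('/tmp/train.tfrecord*')
dataset.build_read_graph(batch_size=64)
# trX, trY = dataset.next_batch(sess)   # sess must already have queue runners started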


trainset = melt.load_dataset(trainset_file, is_record=FLAGS.is_record)
if FLAGS.is_record:
  trainset.build_read_graph(batch_size)

step = 0
while not coord.should_stop():
  #self.trainer.X, self.trainer.Y = trainset.next_batch(self.session)
  _, cost_, accuracy_ = self.session.run([self.train_op, self.cost, self.accuracy])
  if step % 100 == 0:
    print 'step:', step, 'train accuracy:', accuracy_, 'cost:', cost_
  if step % 1000 == 0:
    pass
  step += 1
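For completeness, a hedged sketch of the setup this loop assumes but which the excerpt omits (self.session, self.train_op, self.cost and self.accuracy come from the trainer object; everything else mirrors read-records.py above):

init_op = tf.group(tf.initialize_all_variables(),
                   tf.initialize_local_variables())
self.session.run(init_op)

# The shuffle_batch queues only produce data once queue runners are started.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=self.session, coord=coord)
try:
  pass  # run the training loop shown above
except tf.errors.OutOfRangeError:
  print('Done training -- epoch limit reached')
finally:
  coord.request_stop()
coord.join(threads)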

