当前位置：移动技术网 > IT编程>脚本编程>Python > tensorflow 变长序列存储实例

tensorflow 变长序列存储实例

2020年03月09日 | 移动技术网IT编程 | 我要评论

复兴1910,骑牛难下,上锁的箱子任务怎么做

问题

问题是这样的，要把一个数组存到tfrecord中，然后读取

# Rows have different lengths; dtype=object keeps each row as a plain Python
# list (recent NumPy refuses to build a ragged array implicitly).
a = np.array([[0, 54, 91, 153, 177, 1],
              [0, 50, 89, 147, 196],
              [0, 38, 79, 157],
              [0, 49, 89, 147, 177],
              [0, 32, 73, 145]], dtype=object)

图片我都存储了，这个不还是小意思，一顿操作

import tensorflow as tf
import numpy as np

def _int64_feature(value):
    """Wrap an int, or a list of ints, in a ``tf.train.Feature``.

    Any argument that is not already a ``list`` is first placed in a
    one-element list before being passed to ``tf.train.Int64List``.
    """
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

# Write an array to a TFRecord file.
# `a` holds rows of different lengths; dtype=object keeps every row a plain
# Python list (recent NumPy refuses to build a ragged array implicitly).
a = np.array([[0, 54, 91, 153, 177, 1],
              [0, 50, 89, 147, 196],
              [0, 38, 79, 157],
              [0, 49, 89, 147, 177],
              [0, 32, 73, 145]], dtype=object)

# The context manager closes the writer even if a record fails to serialize.
with tf.python_io.TFRecordWriter('file') as writer:
    for i in range(a.shape[0]):
        feature = {'i': _int64_feature(i),
                   'data': _int64_feature(a[i])}

        # Create an Example protocol buffer.
        example = tf.train.Example(features=tf.train.Features(feature=feature))

        # Serialize to string and write it to the file.
        writer.write(example.SerializeToString())


# Use the Dataset API to read the TFRecord file back.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)
def _parse_function(example_proto):
    """Parse one serialized Example into its ``i`` and ``data`` features.

    NOTE: ``data`` is declared as a scalar (shape ``[]``) although each record
    stores several values, so parsing fails with "number of int64 values !=
    expected". The declaration is kept because this snippet exists to
    demonstrate that error.
    """
    keys_to_features = {'i': tf.FixedLenFeature([], tf.int64),
                        'data': tf.FixedLenFeature([], tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return parsed_features['i'], parsed_features['data']

# Parse every record, repeat forever, and emit batches of one element.
dataset = (dataset.map(_parse_function)
           .shuffle(buffer_size=1)
           .repeat()
           .batch(1))
iterator = dataset.make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
    # Pull three consecutive batches.
    for _ in range(3):
        print(sess.run([i, data]))

报了奇怪的错误：name: <unknown>, key: data, index: 0. number of int64 values != expected. values size: 6 but output shape: []。这意思是我的数据长度为6，但是解析时声明的输出形状是[]。这到底是哪里错了？我先把读取的代码注释掉，看看tfrecord有没有写成功，发现写成功了，这就表明是读取的问题。我怀疑是因为每次写入的长度是变化的，但是又觉得不是，因为图片的尺寸都是不同的，我还是可以读取的。百思不得其解的时候，我发现存储图片的时候用的是img.tobytes()：我把一个数组转换成了bytes，而且用的也是bytes存储。是不是tensorflow会把这个bytes当成一个元素？虽然每个图片的size不同，但是tobytes后tensorflow都会当成一个元素，然后读取的时候再根据(height,width,channel)来解析成图片。

我来试试不存为int64，而是存为bytes。又是一顿厉害的操作

数据转为bytes

# -*- coding: utf-8 -*-

import tensorflow as tf
import numpy as np

def _byte_feature(value):
    """Wrap a single bytes value in a ``tf.train.Feature``."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    """Build an int64 ``tf.train.Feature`` from an int or a list of ints.

    A non-list argument is wrapped in a one-element list first.
    """
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
# Write an array to a TFRecord file, storing each row as raw bytes.
# `a` holds rows of different lengths; dtype=object keeps every row a plain
# Python list (recent NumPy refuses to build a ragged array implicitly).
a = np.array([[0, 54, 91, 153, 177, 1],
              [0, 50, 89, 147, 196],
              [0, 38, 79, 157],
              [0, 49, 89, 147, 177],
              [0, 32, 73, 145]], dtype=object)

# The context manager closes the writer even if a record fails to serialize.
with tf.python_io.TFRecordWriter('file') as writer:
    for i in range(a.shape[0]):  # i = 0 ~ 4
        feature = {
            # Store the row length instead of the index, so the row can be
            # rebuilt when reading.
            'len': _int64_feature(len(a[i])),
            # a[i] is a plain list; convert it to an ndarray to get its bytes.
            'data': _byte_feature(np.array(a[i]).tobytes()),
        }

        # Create an Example protocol buffer.
        example = tf.train.Example(features=tf.train.Features(feature=feature))

        # Serialize to string and write it to the file.
        writer.write(example.SerializeToString())

#
# Use the Dataset API to read the TFRecord file back.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)
def _parse_function(example_proto):
    """Parse one serialized Example into ``(len, data)``.

    ``len`` is a scalar int64 and ``data`` is returned as the raw, still
    undecoded, byte string.
    """
    keys_to_features = {'len': tf.FixedLenFeature([], tf.int64),
                        'data': tf.FixedLenFeature([], tf.string)}  # now a string
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return parsed_features['len'], parsed_features['data']

# Parse every record, repeat forever, and emit batches of one element.
dataset = (dataset.map(_parse_function)
           .shuffle(buffer_size=1)
           .repeat()
           .batch(1))
iterator = dataset.make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
    # Pull three consecutive batches.
    for _ in range(3):
        print(sess.run([i, data]))


"""
[array([6], dtype=int64), array([b'\x00\x00\x00\x006\x00\x00\x00[\x00\x00\x00\x99\x00\x00\x00\xb1\x00\x00\x00\x01\x00\x00\x00'],
 dtype=object)]
[array([5], dtype=int64), array([b'\x00\x00\x00\x002\x00\x00\x00y\x00\x00\x00\x93\x00\x00\x00\xc4\x00\x00\x00'],
 dtype=object)]
[array([4], dtype=int64), array([b'\x00\x00\x00\x00&\x00\x00\x00o\x00\x00\x00\x9d\x00\x00\x00'],
 dtype=object)]
"""

bytes数据解码

如愿的输出来了，但是这个bytes我该如何解码呢

方法一，我们自己解析

 a, b = sess.run([i, data])
 # np.int has been removed from NumPy. np.int_ is the platform default
 # integer, i.e. the dtype np.array(list_of_ints) uses when no dtype is given.
 c = np.frombuffer(b[0], dtype=np.int_, count=a[0])

方法二使用tensorflow的解析函数

def _parse_function(example_proto):
    """Parse one serialized Example and decode ``data`` into int64 values."""
    keys_to_features = {'len': tf.FixedLenFeature([], tf.int64),
                        'data': tf.FixedLenFeature([], tf.string)}  # now a string
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    # decode_raw reinterprets the byte string as int64 values, so it must match
    # the dtype the bytes were written with.
    # NOTE(review): assumes the writer produced 8-byte integers - confirm.
    dat = tf.decode_raw(parsed_features['data'], tf.int64)
    return parsed_features['len'], dat
"""
[array([6]), array([[ 0, 54, 91, 153, 177, 1]])]
[array([5]), array([[ 0, 50, 89, 147, 196]])]
[array([4]), array([[ 0, 38, 79, 157]])]
"""

可以看到是二维数组，这是因为我们使用的是batch输出，虽然我们的batch_size=1，但是还是会以二维list的格式输出。我手贱再来修改点东西，

def _parse_function(example_proto):
    """Parse one serialized Example with both features declared as shape ``[1]``.

    Compared with the scalar (``[]``) declaration, each parsed tensor gains
    one extra axis.
    """
    keys_to_features = {'len': tf.FixedLenFeature([1], tf.int64),
                        'data': tf.FixedLenFeature([1], tf.string)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    dat = tf.decode_raw(parsed_features['data'], tf.int64)
    return parsed_features['len'], dat

"""
[array([[6]]), array([[[ 0, 54, 91, 153, 177, 1]]])]
[array([[5]]), array([[[ 0, 50, 89, 147, 196]]])]
[array([[4]]), array([[[ 0, 38, 79, 157]]])]
"""

呦呵，又变成3维的了，让他报个错试试

def _parse_function(example_proto):
    """Parse one serialized Example with a deliberately wrong ``len`` shape.

    NOTE: ``len`` is declared with shape ``[2]``, so parsing raises
    InvalidArgumentError ("key: len. can't parse serialized example"). The
    declaration is kept because this snippet exists to provoke that error.
    """
    keys_to_features = {'len': tf.FixedLenFeature([2], tf.int64),  # 1 changed to 2
                        'data': tf.FixedLenFeature([1], tf.string)}  # now a string
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return parsed_features['len'], parsed_features['data']

"""
invalidargumenterror: key: len. can't parse serialized example.
 [[node: parsesingleexample/parsesingleexample = parsesingleexample[tdense=[dt_string, dt_int64], dense_keys=["data", "len"], dense_shapes=[[1], [2]], num_sparse=0, sparse_keys=[], sparse_types=[]](arg0, parsesingleexample/const, parsesingleexample/const_1)]]
 [[node: iteratorgetnext_22 = iteratorgetnext[output_shapes=[[?,2], [?,1]], output_types=[dt_int64, dt_string], _device="/job:localhost/replica:0/task:0/device:cpu:0"](oneshotiterator_22)]]
"""

可以看到dense_keys=["data", "len"], dense_shapes=[[1], [2]]。tf.FixedLenFeature是读取固定长度的数据，我猜测[]的意思就是读取全部数据，[1]就是读取一个数据，每个数据可能包含多个数据，形如[[1，2],[3，3，4],[2]....]，哈哈这都是我瞎猜的，做我女朋友好不好。（注：实际上[]表示该特征是一个标量，即恰好包含一个值；[1]表示长度为1的一维向量；[2]则要求恰好包含两个值，所以用它解析只存了一个值的len就会报错。）

tensorflow 变长数组存储

反正是可以读取了。但是如果是自己定义的变长数组，每次都要自己解析，这样很麻烦（我瞎编的），所以tensorflow就定义了变长数组的解析方法tf.VarLenFeature，我们就不需要把变长数组变为bytes再解析了，又是一顿操作

import tensorflow as tf
import numpy as np

def _int64_feature(value):
    """Return a ``tf.train.Feature`` holding ``value`` as an int64 list.

    ``value`` may be a single int or a list of ints; anything that is not a
    ``list`` is wrapped in a one-element list.
    """
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

# Write an array to a TFRecord file.
# `a` holds rows of different lengths; dtype=object keeps every row a plain
# Python list (recent NumPy refuses to build a ragged array implicitly).
a = np.array([[0, 54, 91, 153, 177, 1],
              [0, 50, 89, 147, 196],
              [0, 38, 79, 157],
              [0, 49, 89, 147, 177],
              [0, 32, 73, 145]], dtype=object)

# The context manager closes the writer even if a record fails to serialize.
with tf.python_io.TFRecordWriter('file') as writer:
    for i in range(a.shape[0]):  # i = 0 ~ 4
        feature = {'i': _int64_feature(i),
                   'data': _int64_feature(a[i])}

        # Create an Example protocol buffer.
        example = tf.train.Example(features=tf.train.Features(feature=feature))

        # Serialize to string and write it to the file.
        writer.write(example.SerializeToString())


# Use the Dataset API to read the TFRecord file back.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)
def _parse_function(example_proto):
    """Parse one serialized Example whose ``data`` feature has variable length.

    ``data`` is read with ``tf.VarLenFeature`` (a sparse tensor) and converted
    to a dense tensor before being returned alongside the scalar ``i``.
    """
    keys_to_features = {'i': tf.FixedLenFeature([], tf.int64),
                        'data': tf.VarLenFeature(tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    dense_data = tf.sparse_tensor_to_dense(parsed_features['data'])
    return parsed_features['i'], dense_data

# Parse every record, repeat forever, and emit batches of one element.
dataset = (dataset.map(_parse_function)
           .shuffle(buffer_size=1)
           .repeat()
           .batch(1))
iterator = dataset.make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
    # Pull three consecutive batches.
    for _ in range(3):
        print(sess.run([i, data]))

"""
[array([0], dtype=int64), array([[ 0, 54, 91, 153, 177, 1]], dtype=int64)]
[array([1], dtype=int64), array([[ 0, 50, 89, 147, 196]], dtype=int64)]
[array([2], dtype=int64), array([[ 0, 38, 79, 157]], dtype=int64)]
"""

batch输出

输出还是数组，哈哈哈。再来一波操作

# Ask for two elements per batch instead of one.
dataset = dataset.batch(batch_size=2)
"""
cannot batch tensors with different shapes in component 1. first element had shape [6] and element 1 had shape [5].
"""

这是因为一个batch中数据的shape必须是一致的，第一个元素长度为6，第二个元素长度为5，就会报错。办法就是补成一样的长度，在这之前先测试点别的

# Rows have different lengths; dtype=object keeps each row as a plain Python
# list (recent NumPy refuses to build a ragged array implicitly).
a = np.array([[0, 54, 91, 153, 177, 1],
              [0, 50, 89, 147, 196],
              [0, 38, 79, 157],
              [0, 49, 89, 147, 177],
              [0, 32, 73, 145]], dtype=object)


# Show what type each row of the ragged array has.
for i in range(a.shape[0]):
    print(type(a[i]))

"""
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
"""

可以发现长度不一的array每一个数据是list（一开始我以为是object）。然后补齐

# Every row is padded with zeros to six entries, giving a regular 2-D array.
a = np.array([
    [0, 54, 91, 153, 177, 1],
    [0, 50, 89, 147, 196, 0],
    [0, 38, 79, 157, 0, 0],
    [0, 49, 89, 147, 177, 0],
    [0, 32, 73, 145, 0, 0],
])


# Show what type each row of the padded array has.
for row in a:
    print(type(row))

"""
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
"""

返回的是numpy。为什么要做这件事呢？

def _int64_feature(value):
    """Wrap an int, or a list of ints, in a ``tf.train.Feature``.

    Only a real ``list`` is passed through unchanged; every other argument,
    including a ``numpy.ndarray``, is wrapped in a one-element list.
    """
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

tensorflow要求我们输入的是list或者直接是numpy.ndarray，如果是list中包含numpy.ndarray（即[numpy.ndarray]）就会报错。上面的那个数组是变长的，返回的是list，没有什么错误，我们补齐看看

# The same rows, right-padded with zeros so that every row has six entries.
a = np.array([
    [0, 54, 91, 153, 177, 1],
    [0, 50, 89, 147, 196, 0],
    [0, 38, 79, 157, 0, 0],
    [0, 49, 89, 147, 177, 0],
    [0, 32, 73, 145, 0, 0],
])

"""
typeerror: only size-1 arrays can be converted to python scalars
"""

这就是因为返回的不是list，而是numpy.ndarray，而_int64_feature函数中先判断numpy.ndarray不是list，所以转成了[numpy.ndarray]就报错了。可以做些修改，一种方法是将numpy.ndarray转为list

for i, row in enumerate(a):  # i = 0 ~ 4
    # tolist() turns the ndarray row into a plain list before it is wrapped.
    feature = {
        'i': _int64_feature(i),
        'data': _int64_feature(row.tolist()),
    }

这样补齐了我们就可以修改batch的值了

# Ask for two elements per batch.
dataset = dataset.batch(batch_size=2)

"""
[array([0, 2], dtype=int64), array([[ 0, 54, 91, 153, 177, 1],
 [ 0, 38, 79, 157, 0, 0]], dtype=int64)]
[array([1, 3], dtype=int64), array([[ 0, 50, 89, 147, 196, 0],
 [ 0, 49, 89, 147, 177, 0]], dtype=int64)]
[array([4, 0], dtype=int64), array([[ 0, 32, 73, 145, 0, 0],
 [ 0, 54, 91, 153, 177, 1]], dtype=int64)]
"""

当然tensorflow不会让我自己补齐，已经提供了补齐函数，

# -*- coding: utf-8 -*-

import tensorflow as tf

def _int64_feature(value):
    """Build an int64 ``tf.train.Feature`` from an int or a list of ints.

    A non-list argument is wrapped in a one-element list first.
    """
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

# Plain nested list: rows of different lengths, no NumPy needed.
a = [[0, 54, 91, 153, 177, 1],
     [0, 50, 89, 147, 196],
     [0, 38, 79, 157],
     [0, 49, 89, 147, 177],
     [0, 32, 73, 145]]

# The context manager closes the writer even if a record fails to serialize.
with tf.python_io.TFRecordWriter('file') as writer:
    for v in a:
        feature = {'data': _int64_feature(v)}

        # Create an Example protocol buffer.
        example = tf.train.Example(features=tf.train.Features(feature=feature))

        # Serialize to string and write it to the file.
        writer.write(example.SerializeToString())


# Use the Dataset API to read the TFRecord file back.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)
def _parse_function(example_proto):
    """Parse one serialized Example and return ``data`` as a dense tensor.

    ``data`` is read with ``tf.VarLenFeature`` (a sparse tensor) and then
    converted with ``tf.sparse_tensor_to_dense``.
    """
    keys_to_features = {'data': tf.VarLenFeature(tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return tf.sparse_tensor_to_dense(parsed_features['data'])

# Parse every record and repeat forever. padded_batch pads each batch of two
# rows to a common length; None leaves that dimension's size open.
dataset = (dataset.map(_parse_function)
           .shuffle(buffer_size=1)
           .repeat()
           .padded_batch(2, padded_shapes=([None])))
iterator = dataset.make_one_shot_iterator()
data = iterator.get_next()
with tf.Session() as sess:
    # Pull three consecutive batches.
    for _ in range(3):
        print(sess.run([data]))


"""
[array([[ 0, 54, 91, 153, 177, 1],
 [ 0, 50, 89, 147, 196, 0]])]
[array([[ 0, 38, 79, 157, 0],
 [ 0, 49, 89, 147, 177]])]
[array([[ 0, 32, 73, 145, 0, 0],
 [ 0, 54, 91, 153, 177, 1]])]
"""

可以看到的确是自动补齐了。

图片batch

直接来测试一下图片数据

# -*- coding: utf-8 -*-

import tensorflow as tf
import matplotlib.pyplot as plt
def _byte_feature(value):
    """Return a ``tf.train.Feature`` holding ``value`` as a one-item bytes list."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

# Store the encoded bytes of every *.jpeg file in the current directory.
files = tf.gfile.Glob('*.jpeg')
# The context manager closes the writer even if a file cannot be read.
with tf.python_io.TFRecordWriter('file') as writer:
    for file in files:
        with tf.gfile.FastGFile(file, 'rb') as f:
            img_buff = f.read()
        feature = {'img': _byte_feature(tf.compat.as_bytes(img_buff))}
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(example.SerializeToString())


# Read the TFRecord file back through the Dataset API.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)
def _parse_function(example_proto):
    """Parse one serialized Example and decode its ``img`` bytes as a JPEG."""
    keys_to_features = {'img': tf.FixedLenFeature([], tf.string)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    image = tf.image.decode_jpeg(parsed_features['img'])
    return image

# Decode every record, repeat forever, and batch two images at a time.
# NOTE: batch() needs all elements of a batch to share one shape, so this
# fails when the stored images have different sizes.
dataset = (dataset.map(_parse_function)
           .shuffle(buffer_size=1)
           .repeat()
           .batch(2))
iterator = dataset.make_one_shot_iterator()
image = iterator.get_next()

with tf.Session() as sess:
    img = sess.run([image])
    print(len(img))
    print(img[0].shape)
    # Show the first image of the first batch.
    plt.imshow(img[0][0])

"""
cannot batch tensors with different shapes in component 0. first element had shape [440,440,3] and element 1 had shape [415,438,3].
"""

看到了没有，一个batch中图片的尺寸不同，就不可以batch了，我们必须要将一个batch的图片resize成相同的大小。

def _parse_function(example_proto):
    """Parse one serialized Example into a 224x224 float32 image."""
    keys_to_features = {'img': tf.FixedLenFeature([], tf.string)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    image = tf.image.decode_jpeg(parsed_features['img'])
    # Resizing directly would turn uint8 into float, but plt.imshow only shows
    # uint8 or floats in [0, 1]. convert_image_dtype maps uint8 to floats in
    # [0, 1], which is equivalent to dividing by 255.0.
    image = tf.image.convert_image_dtype(image, tf.float32)
    image = tf.image.resize_images(image, (224, 224))
    return image

但是有时候我们希望输入图片尺寸是不一样的，不需要resize，这样只能将batch_size=1。一个batch中的图片shape必须是一样的，我们可以这样折中训练，使用tensorflow提供的动态填充接口，将一个batch中的图片填充为相同的shape。

# Pad height and width per batch (None = open size); keep the 3 channels fixed.
dataset = dataset.padded_batch(2, padded_shapes=([None, None, 3]))

如果我们想要将图片的名称作为标签保存下来要怎么做呢？

# -*- coding: utf-8 -*-

import tensorflow as tf
import matplotlib.pyplot as plt
import os

# Label alphabet: lowercase letters, uppercase letters, then digits. A
# character's position in this string is its integer label.
out_charset = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"

def _byte_feature(value):
    """Wrap one bytes value in a ``tf.train.Feature`` backed by a ``BytesList``."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(values):
    """Wrap an int, or a list of ints, in a ``tf.train.Feature``.

    Any argument that is not already a ``list`` is first placed in a
    one-element list before being passed to ``tf.train.Int64List``.
    """
    if not isinstance(values, list):
        values = [values]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

# Store every *.jpg in the current directory together with a label derived
# from its file name.
files = tf.gfile.Glob('*.jpg')
# The context manager closes the writer even if a file cannot be read.
with tf.python_io.TFRecordWriter('file') as writer:
    for file in files:
        with tf.gfile.FastGFile(file, 'rb') as f:
            img_buff = f.read()
        # The text before the first dot of the base name is the label string.
        filename = os.path.basename(file).split('.')[0]
        # Encode each character as its position in out_charset.
        label = [out_charset.index(ch) for ch in filename]
        feature = {'label': _int64_feature(label),
                   'filename': _byte_feature(tf.compat.as_bytes(filename)),
                   'img': _byte_feature(tf.compat.as_bytes(img_buff))}
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(example.SerializeToString())


# Read the TFRecord file back through the Dataset API.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)
def _parse_function(example_proto):
    """Parse one serialized Example into ``(image, label, filename)``.

    ``label`` is read as a variable-length int64 feature and densified,
    ``filename`` is returned as the raw string, and ``img`` is decoded as a
    JPEG.
    """
    keys_to_features = {
        'label': tf.VarLenFeature(tf.int64),
        'filename': tf.FixedLenFeature([], tf.string),
        'img': tf.FixedLenFeature([], tf.string),
    }
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    label = tf.sparse_tensor_to_dense(parsed_features['label'])
    filename = parsed_features['filename']
    image = tf.image.decode_jpeg(parsed_features['img'])
    return image, label, filename

dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat()
# Three components are returned, so each one needs its own padded shape. The
# decoded image and the label vary in size and are padded (None); filename is
# a single undecoded byte string, so it needs no padding ([]).
dataset = dataset.padded_batch(
    3, padded_shapes=([None, None, 3], [None], []))
iterator = dataset.make_one_shot_iterator()
image, label, filename = iterator.get_next()

with tf.Session() as sess:
    # The with-block makes sess the default session, which label.eval() uses.
    print(label.eval())

瞎试

如果写入的数据是一个list会是怎样呢

# 16 consecutive integers arranged as a 2 x 4 x 2 array (nested rows).
a = np.arange(16).reshape((2, 4, 2))

"""
typeerror: [0, 1] has type list, but expected one of: int, long
"""

不过想想也是，tf.train.feature(int64_list=tf.train.int64list(value=value))这个函数就是存储数据类型为int64的list的。但是如果我们要存储词向量该怎么办呢？例如一句话是一个样本s1='我爱你',假如使用one-hot编码，我=[0,0,1],爱=[0,1,0],你=[1,0,0],s1=[[0,0,1],[0,1,0],[1,0,0]]。这一个样本该怎么存储呢？

以上这篇tensorflow 变长序列存储实例就是小编分享给大家的全部内容了，希望能给大家一个参考，也希望大家多多支持移动技术网。

您可能感兴趣的文章:

如对本文有疑问，请在下面进行留言讨论，广大热心网友会与你互动！！点击进行留言回复

python如何查看网页代码

用python查看网页代码的方法：1、使用“import”导入requests包import requests2、使用requests包的get()函数通过网页... [阅读全文]
Python如何用wx模块创建文本编辑器

用python的wx模块创建文本编辑器的方法：1、设置按钮的位置import wxapp = wx.app()win = wx.frame(none,title... [阅读全文]
python如何保存文本文件

python保存文本文件的方法：使用python内置的open()类可以打开文本文件，向文件里面写入数据可以用write()函数，写完之后，使用close()函... [阅读全文]
python如何编写win程序

python可以编写win程序。win程序的格式是exe，下面我们就来看一下使用python编写exe程序的方法。编写好python程序后py2exe模块即可将... [阅读全文]
Python替换NumPy数组中大于某个值的所有元素实例

我有一个2d(二维) numpy数组，并希望用255.0替换大于或等于阈值t的所有值。据我所知，最基础的方法是：shape = arr.shaperesult ... [阅读全文]
使用Numpy对特征中的异常值进行替换及条件替换方式

原始数据为excel文件，由传感器获得，通过pyhton xlrd模块读入，读入后为数组形式，由于其存在部分异常值和缺失值，所以便利用numpy对其中的异常值进... [阅读全文]
Python 实现将numpy中的nan和inf,nan替换成对应的均值

nan：not a numberinf：infinity;正无穷numpy中的nan和inf都是float类型t!=t 返回bool类型的数组(矩阵)np.co... [阅读全文]
给ubuntu18安装python3.7的详细教程

参考文章准备工作安装工具sudo apt updatesudo apt upgradesudo apt install gccsudo apt install ... [阅读全文]
python爬虫把url链接编码成gbk2312格式过程解析

1. 问题　　抓取某个网站，发现请求参数是乱码格式，这是点击 textview，发现请求参数如下图所示3. 那么=%b9%fa%ce%f1%d4%ba%b7%a... [阅读全文]
pyecharts在数据可视化中的应用详解

使用pyecharts进行数据可视化安装 pip install pyecharts也可以在pycharm软件里进行下载pyecharts库包。下载成功后进行查... [阅读全文]

网友评论


验证码：

tensorflow 变长序列存储实例

2020年03月09日 | 移动技术网IT编程 | 我要评论

您可能感兴趣的文章:

相关文章:

网友评论