It is known that, at the moment, TF's Record documentation leaves something to be desired.
My question is in regards to what is optimal for storing:
Note: this is not an answer to the question (whether Example or SequenceExample is better, and whether a sequence should be broken down into channels or stored as a byte string).
Rather, it occurred to me whilst looking at TensorFlow Records tutorials, posts, videos, etc., that most examples (that I encountered) focused on constructing the (Sequence)Example with concrete data and did not show how one could be built more dynamically. Thus I encapsulated the four methods above for converting data of the type described in the example.
Although the code is still tied to the data we are trying to wrap in a (Sequence)Example, I hope that - in addition to the concrete examples above - it is of use to those who are still somewhat confused about the format.
Here is some code to play around with. Feedback is welcome.
This has been condensed into a package named Feature Input / Output (FIO).
Here is a Colab demonstrating how to use it.
Namely, it introduces the concept of a "schema":
# Example schema: one entry per feature name, each describing how that feature
# is stored. NOTE(review): the meaning of the individual keys ('length',
# 'encode', 'data_format', ...) is defined by the FIO package, not by any code
# shown here - confirm against its documentation.
SCHEMA = {
    'my-feature': {
        'length': 'fixed',
        'dtype': tf.string,
        'shape': [],
    },
    'seq': {
        'length': 'fixed',
        'dtype': tf.int64,
        'shape': [4, 3],
        'encode': 'channels',
        # three names for a shape whose last dimension is 3
        'channel_names': ['A', 'B', 'C'],
        'data_format': 'channels_last',
    },
}
which allows you to define your data _once_ rather than twice (once to encode it into an example, and once to extract it from a record).
import os, sys, json
sys.path.insert(0, '../')
import tensorflow as tf
import numpy as np
def list_like_q(value) -> bool:
    '''
    Report whether ``value`` is already a sequence of feature values.

    TensorFlow tf.train.Feature requires a list of feature values.
    Many values used in practice are either python lists or numpy.ndarrays.
    We often have features which consist of a singular value.
    For brevity, we define some light helper functions to wrap a list as a
    tf.train.Feature. This lets us test if we need to wrap the value.

    Returns True for lists, tuples and numpy arrays (including subclasses of
    any of these) and False for everything else. Strings and bytes are
    deliberately not treated as list-like: they are single values.
    '''
    # isinstance (rather than an exact type comparison) so that tuples and
    # list / ndarray subclasses are not wrapped a second time by the callers.
    return isinstance(value, (list, tuple, np.ndarray))
def take_all() -> slice:
    '''Return the slice that selects everything along one axis (same as ``:``).'''
    return slice(None)
def take_channel(sequence, channel:int, data_format:str='channels_last'):
slices = [channel, take_all()]
if data_format != 'channels_last': slices.reverse()
return sequence[tuple(slices)]
def number_of_channels(sequence, data_format:str='channels_last') -> int:
    '''
    Return how many channels ``sequence`` has.

    The count is read from the last entry of ``sequence.shape`` when
    ``data_format`` is 'channels_last', and from the first entry otherwise.
    '''
    channel_axis = -1 if data_format == 'channels_last' else 0
    return sequence.shape[channel_axis]
def feature_int64(value):
    '''
    Wrap ``value`` into a tf.train.Feature holding an Int64List.

    A value that list_like_q does not report as list-like is first placed in
    a one-element list.
    '''
    values = value if list_like_q(value) else [value]
    int64_values = tf.train.Int64List(value=values)
    return tf.train.Feature(int64_list=int64_values)
def feature_float(value):
    '''
    Wrap ``value`` into a tf.train.Feature holding a FloatList.

    A value that list_like_q does not report as list-like is first placed in
    a one-element list.
    '''
    values = value if list_like_q(value) else [value]
    float_values = tf.train.FloatList(value=values)
    return tf.train.Feature(float_list=float_values)
def feature_bytes(value):
    '''
    Takes value and wraps it into tf.train.Feature(BytesList).

    Each item is converted to bytes as follows:
      - numpy.ndarray      -> its raw buffer (``ndarray.tobytes()``)
      - bytes / bytearray  -> used as-is
      - anything else      -> ``str(item)`` encoded as UTF-8

    A list or tuple is converted item by item, giving one bytes entry per
    item; any other value gives a single-entry BytesList.
    '''
    def _as_bytes(item):
        # tobytes() replaces the deprecated ndarray.tostring() alias.
        if isinstance(item, np.ndarray): return item.tobytes()
        if isinstance(item, (bytes, bytearray)): return bytes(item)
        return str(item).encode('utf-8')

    # Converting a list element-wise avoids serialising the repr of the whole
    # list (e.g. "['a', 'b']") as one byte string.
    items = value if isinstance(value, (list, tuple)) else [value]
    encoded = [_as_bytes(item) for item in items]
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=encoded))
def feature_function(dtype):
    '''
    Given a dtype name, returns the function for wrapping a value into the
    corresponding tf.train.Feature.

    "int64" selects feature_int64 and "float" selects feature_float; every
    other value falls back to feature_bytes.
    '''
    if dtype == "int64":
        return feature_int64
    if dtype == "float":
        return feature_float
    return feature_bytes
def feature_list(iterable, dtype:str='float'):
    '''
    Given an iterable, returns the tf.train.FeatureList of the corresponding
    dtype.

    iterable: the items to wrap; each item becomes one tf.train.Feature.
    dtype:    'int64', 'float' or anything else (treated as bytes), as
              resolved by feature_function.
    '''
    wrap = feature_function(dtype)
    # Protobuf message constructors only accept keyword arguments, so the
    # features must be passed through the ``feature`` field by name.
    return tf.train.FeatureList(feature=[wrap(item) for item in iterable])
# the next three for completeness
def feature_list_int64(value):
    '''
    Returns a tf.train.FeatureList with one Int64List feature per item of
    the iterable ``value``.
    '''
    # ``feature`` expects an iterable of tf.train.Feature messages, so the
    # individual features are built here rather than nesting a FeatureList.
    return tf.train.FeatureList(feature=[feature_int64(item) for item in value])
def feature_list_float(value):
    '''
    Returns a tf.train.FeatureList with one FloatList feature per item of
    the iterable ``value``.
    '''
    # ``feature`` expects an iterable of tf.train.Feature messages, so the
    # individual features are built here rather than nesting a FeatureList.
    return tf.train.FeatureList(feature=[feature_float(item) for item in value])
def feature_list_bytes(value):
    '''
    Returns a tf.train.FeatureList with one BytesList feature per item of
    the iterable ``value``.
    '''
    # ``feature`` expects an iterable of tf.train.Feature messages, so the
    # individual features are built here rather than nesting a FeatureList.
    return tf.train.FeatureList(feature=[feature_bytes(item) for item in value])
def dict_to_features(values:dict, types:dict) -> dict:
    '''
    Given values and types, walks the name:dtype pairs of ``types`` and wraps
    ``values[name]`` in the feature type chosen by feature_function(dtype).

    Returns a dict mapping each name in ``types`` to its tf.train.Feature.
    Names present in ``values`` but not in ``types`` are ignored; a name
    missing from ``values`` raises KeyError.
    '''
    features = {}
    for name, dtype in types.items():
        to_feature = feature_function(dtype)
        features[name] = to_feature(values[name])
    return features
def features_from_dict(values:dict, types:dict):
    '''
    Builds a tf.train.Features message from ``values``, converting each entry
    with dict_to_features according to the name:dtype pairs in ``types``.
    '''
    feature_map = dict_to_features(values, types)
    return tf.train.Features(feature=feature_map)
def default_channel_names(sequence, data_format:str='channels_last') -> list:
    '''
    Produces the fallback names 'Channel 0', 'Channel 1', ... with one name
    per channel, as counted by number_of_channels(sequence, data_format).
    '''
    count = number_of_channels(sequence, data_format)
    names = []
    for index in range(count):
        names.append(f'Channel {index}')
    return names
def channels_to_features(sequence, dtype:str='float', data_format:str='channels_last', channel_names:list=None) -> dict:
    '''
    Given a sequence with its dtype and data_format, and optional
    channel_names, returns the dictionary of channel name : tf.train.Feature
    pairs.

    The i-th name is paired with take_channel(sequence, i, data_format),
    wrapped by the function feature_function(dtype) selects. When
    channel_names is None, default_channel_names supplies the names.
    '''
    if channel_names is None:
        channel_names = default_channel_names(sequence, data_format)

    features = {}
    for index, name in enumerate(channel_names):
        to_feature = feature_function(dtype)
        features[name] = to_feature(take_channel(sequence, index, data_format))
    return features
def channels_to_feature_list(sequence, dtype:str='float', data_format:str='channels_last'):
    '''
    Given a sequence of dtype and data_format, returns the tf.train.FeatureList
    where each element corresponds to one channel of the sequence.

    The per-channel features come from channels_to_features (with its default
    channel names); only their values are kept, in dictionary order.
    '''
    per_channel = channels_to_features(sequence, dtype, data_format)
    channel_features = [*per_channel.values()]
    return tf.train.FeatureList(feature=channel_features)
class SequenceRecord:
'''
SequenceRecord is a supporting class built on top of the functions found in
/model/utils/features.py with the purpose of converting our data consisting
of:
- a sequence of length n,
- n vectors of class probability vectors (refered to as pclasses), and
- metadata (name of sequence, start site, stop site, etc)
and converting it into a TensorFlow (Sequence)Example which can
subsequentially be written as a TensorFlow Record.
For both Example and SequenceExample options, the channels / classes of the
sequence / pclasses can be stored as numeric features (int64 / float) or as
a byte string. For each of these options, the encoding can be done per
channel / class, or the entire sequence / pclasses matrix.
Overwrite the following class variables to suit your needs:
_class_var || description
---------------------------------------------------------------------------
_metadata_types:dict || a dictionary of : pairs which
|| is refered to when the metadata is converted into
|| tf.train.Feature (only 'int64', 'float', 'bytes' are
|| supported for )
_sequence_data_format|| a string specifying where the channels are. By
|| default, this is set to 'channels_last'
_pclasses_data_format|| a string specifying where the channels are (by
|| default, this is set to 'channels_last')
_sequence_data_type || a string specifying what dtype channels should be
|| encoded as (by default 'int64')
_pclasses_data_type || a string specifying what dtype channels should be
|| encoded as (by default 'float')
_channel_names || a list of strings specifying the name and order
|| channels appear in (by default set to
|| None)
_classes_names || a list of strings specifying the name and order
|| classes appear as channels in (by default
|| set to None)
'''
_metadata_types = {}
_sequence_data_format = 'channels_last'
_pclasses_data_format = 'channels_last'
_sequence_data_type = 'int64'
_pclasses_data_type = 'float'
_channel_names = None
_classes_names = None
def make_example(self, sequence, pclasses, metadata:dict={}, form:str='example', by:str='channels'):
'''
The core function of SequenceRecord. Given , and
converts them to the corresponing
sequences = np.array([
# sequence 1
[
# el1, el2, el3
[ 1, 1, 1], # channel 1
[ 2, 2, 2], # channel 2
[ 3, 3, 3], # channel 3
],
#sequence 2
[
[ 10, 10, 10], # channel 1
[ 20, 20, 20], # channel 2
[ 30, 30, 30], # channel 3
]
])
pclasses = np.array([
# sequence 1
[
# cls1, cls2, cls3
[ 0, 0.9, 0.1], # class probabilities element 1
[ 0, 0.1, 0.9], # class probabilities element 2
[ 0.8, 0.1, 0.1] # class probabilities element 3
],
# sequence 2
[
# cls1, cls2, cls3
[ 0.8, 0.1, 0.1], # class probabilities element 3
[ 0, 0.1, 0.9], # class probabilities element 2
[ 0, 0.9, 0.1] # class probabilities element 1
]
])
metadata = [
{'Name': 'sequence 1', 'Val_1': 100, 'Val_2': 10},
{'Name': 'sequence 2', 'Val_1': 10, 'Val_2': 100}
]
metatypes = {'Name': 'bytes', 'Val_1': 'float', 'Val_2': 'float'}
SequenceRecord._channel_names = ['Channel 1', 'Channel 2', 'Channel 3']
SequenceRecord._classes_names = ['Class A', 'Class B', 'Class C']
SequenceRecord._metadata_types = metatypes
SR = SequenceRecord()
SR.make_example(sequences[0], pclasses[0], metadata[0], form='example', by='channels')
SR.make_example(sequences[0], pclasses[0], metadata[0], form='example', by='bstrings')
SR.make_example(sequences[0], pclasses[0], metadata[0], form='example', by='bdstring')
SR.make_example(sequences[0], pclasses[0], metadata[0], form='sequence', by='channels')
SR.make_example(sequences[0], pclasses[0], metadata[0], form='sequence', by='bstrings')
SR.make_example(sequences[0], pclasses[0], metadata[0], form='sequence', by='bdstring')