My scenario is that we have multiple peers with their own data, located in different directories, with the same sub-directory structure. I want to train the model using thos
After so many days I hope you have found the solution to the problem, but I will share another idea here so that new people like me who will face the same problem in the future, get help.
A few days ago I had this kind of problem. follow_links
will be a solution to your question, as user3731622 said. Also, I think the idea of merging two data generators will work. However, in that case, the batch sizes of the corresponding data generators have to be determined proportion to the extent of data in each relevant directory.
Batch size of sub-generators:
Where,
b = Batch Size Of Any Sub-generator
B = Desired Batch Size Of The Merged Generator
n = Number Of Images In That Directory Of Sub-generator
the sum of n = Total Number Of Images In All Directories
See the code below, this may help:
from keras.preprocessing.image import ImageDataGenerator
from keras.utils import Sequence
import matplotlib.pyplot as plt
import numpy as np
import os
class MergedGenerators(Sequence):
def __init__(self, batch_size, generators=[], sub_batch_size=[]):
self.generators = generators
self.sub_batch_size = sub_batch_size
self.batch_size = batch_size
def __len__(self):
return int(
sum([(len(self.generators[idx]) * self.sub_batch_size[idx])
for idx in range(len(self.sub_batch_size))]) /
self.batch_size)
def __getitem__(self, index):
"""Getting items from the generators and packing them"""
X_batch = []
Y_batch = []
for generator in self.generators:
if generator.class_mode is None:
x1 = generator[index % len(generator)]
X_batch = [*X_batch, *x1]
else:
x1, y1 = generator[index % len(generator)]
X_batch = [*X_batch, *x1]
Y_batch = [*Y_batch, *y1]
if self.generators[0].class_mode is None:
return np.array(X_batch)
return np.array(X_batch), np.array(Y_batch)
def build_datagenerator(dir1=None, dir2=None, batch_size=32):
n_images_in_dir1 = sum([len(files) for r, d, files in os.walk(dir1)])
n_images_in_dir2 = sum([len(files) for r, d, files in os.walk(dir2)])
# Have to set different batch size for two generators as number of images
# in those two directories are not same. As we have to equalize the image
# share in the generators
generator1_batch_size = int((n_images_in_dir1 * batch_size) /
(n_images_in_dir1 + n_images_in_dir2))
generator2_batch_size = batch_size - generator1_batch_size
generator1 = ImageDataGenerator(
rescale=1. / 255,
shear_range=0.2,
zoom_range=0.2,
rotation_range=5.,
horizontal_flip=True,
)
generator2 = ImageDataGenerator(
rescale=1. / 255,
zoom_range=0.2,
horizontal_flip=False,
)
# generator2 has different image augmentation attributes than generaor1
generator1 = generator1.flow_from_directory(
dir1,
target_size=(128, 128),
color_mode='rgb',
class_mode=None,
batch_size=generator1_batch_size,
shuffle=True,
seed=42,
interpolation="bicubic",
)
generator2 = generator2.flow_from_directory(
dir2,
target_size=(128, 128),
color_mode='rgb',
class_mode=None,
batch_size=generator2_batch_size,
shuffle=True,
seed=42,
interpolation="bicubic",
)
return MergedGenerators(
batch_size,
generators=[generator1, generator2],
sub_batch_size=[generator1_batch_size, generator2_batch_size])
def test_datagen(batch_size=32):
datagen = build_datagenerator(dir1="./asdf",
dir2="./asdf2",
batch_size=batch_size)
print("Datagenerator length (Batch count):", len(datagen))
for batch_count, image_batch in enumerate(datagen):
if batch_count == 1:
break
print("Images: ", image_batch.shape)
plt.figure(figsize=(10, 10))
for i in range(image_batch.shape[0]):
plt.subplot(1, batch_size, i + 1)
plt.imshow(image_batch[i], interpolation='nearest')
plt.axis('off')
plt.tight_layout()
test_datagen(4)
The Keras ImageDataGenerator flow_from_directory
method has a follow_links
parameter.
Maybe you can create one directory which is populated with symlinks to files in all the other directories.
This stack question discusses using symlinks with Keras ImageDataGenerator: Understanding 'follow_links' argument in Keras's ImageDataGenerator?