使用GitLab或者Github简单实用地将数据导入Colab的方法

    xiaoxiao2023-09-29  154

    Google Colab为我们提供了免费的GPU和TPU计算资源。对于暂时没有自己的本地GPU显卡资源的深度学习科研人员而言,这真是个福利。

    这篇博客主要要解决的问题是:如何简单方便实用地将外部训练数据导入到Google Colab中呢?

    其它博客讲到了使用Google Drive或者从本地电脑来导入数据,这些都是可以的,但是个人觉得比较复杂,而这里的方法我们只需要一行代码即可导入数据。而且很多训练数据都能够在Gitlab或者Github上面找到,也就是说数据存在共享的可能性,不需要每次都由我们自己将本地数据上传到云端(Gitlab或者Github)。

    具体方法: 首先在Gitlab或者Github中寻找是否已经存在该数据集,比如kaggle中狗和猫的数据集,链接为https://www.kaggle.com/c/dogs-vs-cats, 如果存在就直接fork到我们的Gitlab或者github账户下面(或者直接使用该git链接),这里本人找到了一个,链接为https://gitlab.com/liangyihuai/cats_and_dogs_small。 如果没有找到,就将数据从本地上传到Gitlab或者Github上面。

    上传数据集到git上面之后,就可以在Colab中使用下面这一行代码将我们的数据导入到Colab的虚拟机中(虚拟的Linux文件系统)。下面的这个地址https://gitlab.com/liangyihuai/cats_and_dogs_small.git是在仓库页面点击蓝色的Clone按钮之后获取的。 ! git clone https://gitlab.com/liangyihuai/cats_and_dogs_small.git 使用下面命令查看我们所导入的数据 ! ls

    这样我们就完成了将数据导入到Colab的过程。下面给出在Colab上面使用我们上传的数据训练一个分类器的完整代码,访问下面链接或者直接看代码

    https://colab.research.google.com/drive/1k_1MmC8jY5HYoaPcq7AQa5t7G84Wu4qL

    # -*- coding: utf-8 -*- """import file from google driver.ipynb Automatically generated by Colaboratory. Original file is located at https://colab.research.google.com/drive/1k_1MmC8jY5HYoaPcq7AQa5t7G84Wu4qL """ import tensorflow as tf tf.test.gpu_device_name() import os, shutil from keras import Sequential, losses, metrics from keras.layers import Flatten, Conv2D, MaxPool2D, Dense, Dropout from keras import optimizers from keras.preprocessing.image import ImageDataGenerator from keras.preprocessing import image import matplotlib.pyplot as plt ! git clone https://gitlab.com/liangyihuai/cats_and_dogs_small.git ! ls ! ls cats_and_dogs_small/ base_dir = 'cats_and_dogs_small' train_dir = os.path.join(base_dir, 'train') validation_dir = os.path.join(base_dir, 'validation') test_dir = os.path.join(base_dir, 'test') train_cats_dir = os.path.join(train_dir, 'cats') train_dogs_dir = os.path.join(train_dir, 'dogs') validation_cats_dir = os.path.join(validation_dir, 'cats') validation_dogs_dir = os.path.join(validation_dir, 'dogs') test_cats_dir = os.path.join(test_dir, 'cats') test_dogs_dir = os.path.join(test_dir, 'dogs') print('total training cat images: ', len(os.listdir(train_cats_dir))) print('total training dog images:', len(os.listdir(train_dogs_dir))) print('total validation cat images:', len(os.listdir(validation_cats_dir))) print('total validation dog images:', len(os.listdir(validation_dogs_dir))) print('total test cat images:', len(os.listdir(test_cats_dir))) print('total test dog images:', len(os.listdir(test_dogs_dir))) datagen = ImageDataGenerator( rotation_range=40, width_shift_range=0.2, height_shift_range=0.2, shear_range=0.2, zoom_range=0.2, horizontal_flip=True, fill_mode='nearest') fnames = [os.path.join(train_cats_dir, fname) for fname in os.listdir(train_cats_dir)] img_path = fnames[1] img = image.load_img(img_path, target_size=(150, 150)) x = image.img_to_array(img) x = x.reshape((1,)+x.shape) i = 0 for batch in datagen.flow(x, batch_size=1): 
plt.figure(i) imgplot = plt.imshow(image.array_to_img(batch[0])) i += 1 if i % 4 == 0: break plt.show() model = Sequential() model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3))) model.add(MaxPool2D((2, 2))) model.add(Conv2D(64, (3, 3), activation='relu')) model.add(MaxPool2D((2, 2))) model.add(Conv2D(128, (3, 3), activation='relu')) model.add(MaxPool2D((2, 2))) model.add(Conv2D(128, (3, 3), activation='relu')) model.add(MaxPool2D((2, 2))) model.add(Flatten()) model.add(Dropout(0.5)) model.add(Dense(512, activation='relu')) model.add(Dense(1, activation='sigmoid')) model.compile(loss=losses.binary_crossentropy, optimizer=optimizers.RMSprop(lr=1e-4), metrics=[metrics.binary_accuracy]) train_datagen = ImageDataGenerator( rescale=1./255, rotation_range=40, width_shift_range=0.2, height_shift_range=0.2, shear_range=0.2, zoom_range=0.2, horizontal_flip=True,) test_datagen = ImageDataGenerator(rescale=1./255) train_generator = train_datagen.flow_from_directory( train_dir, target_size=(150, 150), batch_size=32, class_mode='binary') validation_generator = test_datagen.flow_from_directory( validation_dir, target_size=(150, 150), batch_size=32, class_mode='binary') history = model.fit_generator( train_generator, steps_per_epoch=100, epochs=100, validation_data=validation_generator, validation_steps=50) model.save('cats_and_dogs_small_2.h5') acc = history.history['binary_accuracy'] val_acc = history.history['val_binary_accuracy'] loss = history.history['loss'] val_loss = history.history['val_loss'] epochs = range(1, len(acc) + 1) plt.plot(epochs, acc, 'bo', label='Training acc') plt.plot(epochs, val_acc, 'b', label='Validation acc') plt.title('Training and validation accuracy') plt.legend() plt.figure() plt.plot(epochs, loss, 'bo', label='Training loss') plt.plot(epochs, val_loss, 'b', label='Validation loss') plt.title('Training and validation loss') plt.legend() plt.show()

    谢谢

    最新回复(0)