深度学习-序列模型 吴恩达第五课第三周作业2答案(Trigger Word Detection)

    xiaoxiao2023-11-19  158

    import numpy as np from pydub import AudioSegment import random import sys import io import os import glob import IPython from td_utils import * %matplotlib inline

    # Listen to one raw sample from each category: an "activates" clip,
    # a "negatives" clip and a "backgrounds" clip.
    IPython.display.Audio("./raw_data/activates/1.wav")
    IPython.display.Audio("./raw_data/negatives/4.wav")
    IPython.display.Audio("./raw_data/backgrounds/1.wav")

    # Play a prepared training example and keep what graph_spectrogram returns
    # for it (presumably the spectrogram matrix -- confirm in td_utils).
    IPython.display.Audio("audio_examples/example_train.wav")
    x = graph_spectrogram("audio_examples/example_train.wav")

    # Compare the number of raw audio samples with the spectrogram shape.
    _, data = wavfile.read("audio_examples/example_train.wav")
    print("Time steps in audio recording before spectrogram", data[:,0].shape)
    print("Time steps in input after spectrogram", x.shape)
    # Recorded output:
    #   Time steps in audio recording before spectrogram (441000,)
    #   Time steps in input after spectrogram (101, 5511)

    Tx = 5511 # The number of time steps input to the model from the spectrogram
    n_freq = 101 # Number of frequencies input to the model at each time step of the spectrogram

    Ty = 1375 # The number of time steps in the output of our model

    # Load audio segments using pydub
    activates, negatives, backgrounds = load_raw_audio()

    print("background len: " + str(len(backgrounds[0])))    # Should be 10,000, since it is a 10 sec clip
    print("activate[0] len: " + str(len(activates[0])))     # Maybe around 1000, since an "activate" audio clip is usually around 1 sec (but varies a lot)
    print("activate[1] len: " + str(len(activates[1])))     # Different "activate" clips can have different lengths
    # Recorded output:
    #   background len: 10000
    #   activate[0] len: 655
    #   activate[1] len: 725

    def get_random_time_segment(segment_ms, background_ms=10000):
        """
        Gets a random time segment of duration segment_ms inside a background clip.

        Arguments:
        segment_ms -- the duration of the audio clip in ms ("ms" stands for "milliseconds")
        background_ms -- the duration of the background clip in ms; defaults to
                         10,000 (the 10 second clips used throughout this file)

        Returns:
        segment_time -- a tuple of (segment_start, segment_end) in ms, both inclusive

        Raises:
        ValueError -- if segment_ms is not strictly smaller than background_ms
        """
        if segment_ms >= background_ms:
            # np.random.randint would raise an opaque "low >= high" ValueError
            # here; fail with a message that names the actual problem instead.
            raise ValueError(
                "segment of %d ms does not fit into a %d ms background"
                % (segment_ms, background_ms)
            )

        # Keep the exclusive upper bound of the original implementation so that
        # seeded runs keep producing the same segments.
        segment_start = np.random.randint(low=0, high=background_ms - segment_ms)
        segment_end = segment_start + segment_ms - 1

        return (segment_start, segment_end)

    # GRADED FUNCTION: is_overlapping

    def is_overlapping(segment_time, previous_segments):
        """
        Checks if the time of a segment overlaps with the times of existing segments.

        Arguments:
        segment_time -- a tuple of (segment_start, segment_end) for the new segment
        previous_segments -- a list of tuples of (segment_start, segment_end) for the existing segments

        Returns:
        True if the time segment overlaps with any of the existing segments, False otherwise
        """
        segment_start, segment_end = segment_time

        # Two inclusive intervals intersect exactly when each one starts no later
        # than the other one ends. Testing only whether the new endpoints fall
        # inside an existing segment misses the case where the new segment fully
        # contains an existing one, e.g. (100, 500) against (200, 300).
        return any(
            segment_start <= previous_end and segment_end >= previous_start
            for previous_start, previous_end in previous_segments
        )

    overlap1 = is_overlapping((950, 1430), [(2000, 2550), (260, 949)])
    overlap2 = is_overlapping((2305, 2950), [(824, 1532), (1900, 2305), (3424, 3656)])
    print("Overlap 1 = ", overlap1)
    print("Overlap 2 = ", overlap2)
    # Recorded output:
    #   Overlap 1 = False
    #   Overlap 2 = True

    # GRADED FUNCTION: insert_audio_clip

    def insert_audio_clip(background, audio_clip, previous_segments, max_attempts=None):
        """
        Insert a new audio segment over the background noise at a random time step, ensuring that the
        audio segment does not overlap with existing segments.

        Arguments:
        background -- a 10 second background audio recording.
        audio_clip -- the audio clip to be inserted/overlaid.
        previous_segments -- times where audio segments have already been placed;
                             the chosen segment is appended to this list in place
        max_attempts -- optional upper bound on the number of random placements
                        tried; None (the default) retries until a free slot is
                        found, which never terminates if no free slot exists

        Returns:
        new_background -- the updated background audio
        segment_time -- the (segment_start, segment_end) tuple that was chosen

        Raises:
        RuntimeError -- if max_attempts is given and every attempt overlapped
        """
        # Get the duration of the audio clip in ms
        segment_ms = len(audio_clip)

        # Step 1: Pick a random time segment onto which to insert the new clip.
        segment_time = get_random_time_segment(segment_ms)
        attempts = 1

        # Step 2: Keep drawing new candidates while the current one collides
        # with a segment that has already been placed.
        while is_overlapping(segment_time, previous_segments):
            if max_attempts is not None and attempts >= max_attempts:
                raise RuntimeError(
                    "no free slot found for a %d ms clip after %d attempts"
                    % (segment_ms, attempts)
                )
            segment_time = get_random_time_segment(segment_ms)
            attempts += 1

        # Step 3: Record the chosen segment so later insertions avoid it.
        previous_segments.append(segment_time)

        # Step 4: Superpose audio segment and background
        new_background = background.overlay(audio_clip, position=segment_time[0])

        return new_background, segment_time

    np.random.seed(5)
    audio_clip, segment_time = insert_audio_clip(backgrounds[0], activates[0], [(3790, 4400)])
    audio_clip.export("insert_test.wav", format="wav")
    print("Segment Time: ", segment_time)
    IPython.display.Audio("insert_test.wav")
    # Recorded output:
    #   Segment Time: (2915, 3569)

    # Expected audio
    IPython.display.Audio("audio_examples/insert_reference.wav")

    # GRADED FUNCTION: insert_ones

    def insert_ones(y, segment_end_ms):
        """
        Update the label vector y. The labels of the 50 output steps strictly after the end of the segment
        should be set to 1. By strictly we mean that the label of segment_end_y should be 0 while the
        50 following labels should be ones.

        Arguments:
        y -- numpy array of shape (1, Ty), the labels of the training example
        segment_end_ms -- the end time of the segment in ms

        Returns:
        y -- updated labels (the same array that was passed in, modified in place)
        """
        # duration of the background (in terms of spectrogram time-steps)
        Ty = y.shape[1]
        # Convert the end time from ms on the 10,000 ms clip to an output step.
        segment_end_y = int(segment_end_ms * Ty / 10000.0)

        # A slice assignment stops at the end of the array by itself, so no
        # per-index bound check is needed. Clamp both bounds at 0 so that a
        # negative index cannot wrap around to the tail of the label vector.
        first = max(segment_end_y + 1, 0)
        last = max(segment_end_y + 51, 0)
        y[0, first:last] = 1

        return y

    arr1 = insert_ones(np.zeros((1, Ty)), 9700)
    plt.plot(insert_ones(arr1, 4251)[0,:])
    print("sanity checks:", arr1[0][1333], arr1[0][634], arr1[0][635])
    # Recorded output:
    #   sanity checks: 0.0 1.0 0.0

    # GRADED FUNCTION: create_training_example

    def create_training_example(background, activates, negatives):
        """
        Creates a training example with a given background, activates, and negatives.

        Arguments:
        background -- a 10 second background audio recording
        activates -- a list of audio segments of the word "activate"
        negatives -- a list of audio segments of random words that are not "activate"

        Returns:
        x -- the spectrogram of the training example
        y -- the label at each time step of the spectrogram
        """
        # Set the random seed. NOTE: the seed is fixed inside the function, so
        # every call draws the same random sequence.
        np.random.seed(18)

        # Make background quieter
        background = background - 20

        # Step 1: Initialize y (label vector) of zeros
        y = np.zeros((1, Ty))

        # Step 2: Initialize segment times as empty list
        previous_segments = []

        # Select 0-4 random "activate" audio clips from the entire list of "activates" recordings
        number_of_activates = np.random.randint(0, 5)
        random_indices = np.random.randint(len(activates), size=number_of_activates)
        random_activates = [activates[i] for i in random_indices]

        # Step 3: Loop over randomly selected "activate" clips and insert in background
        for random_activate in random_activates:
            # Insert the audio clip on the background
            background, segment_time = insert_audio_clip(background, random_activate, previous_segments)
            # Only the end of the segment is needed to place the labels.
            _, segment_end = segment_time
            # Insert labels in "y"
            y = insert_ones(y, segment_end)

        # Select 0-2 random negatives audio recordings from the entire list of "negatives" recordings
        number_of_negatives = np.random.randint(0, 3)
        random_indices = np.random.randint(len(negatives), size=number_of_negatives)
        random_negatives = [negatives[i] for i in random_indices]

        # Step 4: Loop over randomly selected negative clips and insert in background
        for random_negative in random_negatives:
            # Insert the audio clip on the background; negatives get no labels.
            background, _ = insert_audio_clip(background, random_negative, previous_segments)

        # Standardize the volume of the audio clip
        background = match_target_amplitude(background, -20.0)

        # Export new training example. export() hands back the file object it
        # wrote to; close it so the file is not held open while it is re-read.
        file_handle = background.export("train" + ".wav", format="wav")
        file_handle.close()
        print("File (train.wav) was saved in your directory.")

        # Get and plot spectrogram of the new recording (background with superposition of positive and negatives)
        x = graph_spectrogram("train.wav")

        return x, y

    x, y = create_training_example(backgrounds[0], activates, negatives)

    # Load preprocessed training examples X = np.load("./XY_train/X.npy") Y = np.load("./XY_train/Y.npy")

    # Load preprocessed dev set examples X_dev = np.load("./XY_dev/X_dev.npy") Y_dev = np.load("./XY_dev/Y_dev.npy")

    from keras.callbacks import ModelCheckpoint from keras.models import Model, load_model, Sequential from keras.layers import Dense, Activation, Dropout, Input, Masking, TimeDistributed, LSTM, Conv1D from keras.layers import GRU, Bidirectional, BatchNormalization, Reshape from keras.optimizers import Adam

    # GRADED FUNCTION: model

    def model(input_shape):
        """
        Build the Keras graph of the trigger-word detector.

        Argument:
        input_shape -- shape of the model's input data (using Keras conventions)

        Returns:
        model -- Keras model instance
        """
        inputs = Input(shape=input_shape)

        # Step 1: strided 1-D convolution, then batch norm, ReLU and dropout.
        hidden = Conv1D(filters=196, kernel_size=15, strides=4)(inputs)
        hidden = BatchNormalization()(hidden)
        hidden = Activation('relu')(hidden)
        hidden = Dropout(0.8)(hidden)

        # Steps 2 and 3: two identical recurrent stages, each a 128-unit GRU
        # returning the full sequence, followed by dropout and batch norm.
        for _ in range(2):
            hidden = GRU(128, return_sequences=True)(hidden)
            hidden = Dropout(0.8)(hidden)
            hidden = BatchNormalization()(hidden)

        # The second recurrent stage is followed by one extra dropout layer.
        hidden = Dropout(0.8)(hidden)

        # Step 4: one sigmoid unit applied independently at every time step.
        outputs = TimeDistributed(Dense(1, activation="sigmoid"))(hidden)

        return Model(inputs=inputs, outputs=outputs)

    model = model(input_shape = (Tx, n_freq))
    model.summary()
    # Recorded output:
    #   _________________________________________________________________
    #   Layer (type)                 Output Shape              Param #
    #   =================================================================
    #   input_1 (InputLayer)         (None, 5511, 101)         0
    #   _________________________________________________________________
    #   conv1d_1 (Conv1D)            (None, 1375, 196)         297136
    #   _________________________________________________________________
    #   batch_normalization_1 (Batch (None, 1375, 196)         784
    #   _________________________________________________________________
    #   activation_1 (Activation)    (None, 1375, 196)         0
    #   _________________________________________________________________
    #   dropout_1 (Dropout)          (None, 1375, 196)         0
    #   _________________________________________________________________
    #   gru_1 (GRU)                  (None, 1375, 128)         124800
    #   _________________________________________________________________
    #   dropout_2 (Dropout)          (None, 1375, 128)         0
    #   _________________________________________________________________
    #   batch_normalization_2 (Batch (None, 1375, 128)         512
    #   _________________________________________________________________
    #   gru_2 (GRU)                  (None, 1375, 128)         98688
    #   _________________________________________________________________
    #   dropout_3 (Dropout)          (None, 1375, 128)         0
    #   _________________________________________________________________
    #   batch_normalization_3 (Batch (None, 1375, 128)         512
    #   _________________________________________________________________
    #   dropout_4 (Dropout)          (None, 1375, 128)         0
    #   _________________________________________________________________
    #   time_distributed_1 (TimeDist (None, 1375, 1)           129
    #   =================================================================
    #   Total params: 522,561
    #   Trainable params: 521,657
    #   Non-trainable params: 904

    # Rebind `model` to the network stored in ./models/tr_model.h5, then
    # compile it and fit it for one epoch on the loaded training set.
    model = load_model('./models/tr_model.h5')
    opt = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, decay=0.01)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=["accuracy"])
    model.fit(X, Y, batch_size = 5, epochs=1)
    # Recorded output:
    #   Epoch 1/1
    #   26/26 [==============================] - 11s 423ms/step - loss: 0.0726 - acc: 0.9805

    # Evaluate on the dev set and report the accuracy.
    loss, acc = model.evaluate(X_dev, Y_dev)
    print("Dev set accuracy = ", acc)
    # Recorded output:
    #   25/25 [==============================] - 1s 27ms/step
    #   Dev set accuracy = 0.9312872886657715

    def detect_triggerword(filename):
        """
        Plot the spectrogram of an audio file together with the predictions of
        the module-level `model` for it, and return those predictions.

        Arguments:
        filename -- path of the audio file to analyse

        Returns:
        predictions -- the array returned by model.predict for this file
        """
        # Upper panel: graph_spectrogram draws into the current subplot.
        plt.subplot(2, 1, 1)
        spectrogram = graph_spectrogram(filename)

        # The spectrogram comes back as (freqs, Tx); the model wants
        # (Tx, freqs) with a leading batch axis of size one.
        batch = np.expand_dims(spectrogram.swapaxes(0, 1), axis=0)
        predictions = model.predict(batch)

        # Lower panel: predicted value at every output step.
        plt.subplot(2, 1, 2)
        plt.plot(predictions[0, :, 0])
        plt.ylabel('probability')
        plt.show()

        return predictions

    chime_file = "audio_examples/chime.wav"

    def chime_on_activate(filename, predictions, threshold):
        """
        Overlay a chime on an audio file wherever the prediction exceeds the
        threshold, and write the result to "chime_output.wav".

        Arguments:
        filename -- path of the wav file the predictions were computed for
        predictions -- array indexed as [0, step, 0] holding one value per output step
        threshold -- value a prediction must exceed to trigger a chime
        """
        audio_clip = AudioSegment.from_wav(filename)
        chime = AudioSegment.from_wav(chime_file)
        Ty = predictions.shape[1]

        # Output steps seen since the last chime (or since the start).
        steps_since_chime = 0

        for step in range(Ty):
            steps_since_chime += 1

            # A chime needs a prediction above the threshold and more than 75
            # output steps since the previous chime.
            if not (predictions[0, step, 0] > threshold) or steps_since_chime <= 75:
                continue

            # Map the output step to a position in ms within the clip.
            fraction = step / Ty
            position_ms = (fraction * audio_clip.duration_seconds) * 1000
            audio_clip = audio_clip.overlay(chime, position=position_ms)

            # Start counting again after this chime.
            steps_since_chime = 0

        audio_clip.export("chime_output.wav", format='wav')

    # Run detection on the first dev recording, add chimes at a 0.5 threshold
    # and play the result.
    filename = "./raw_data/dev/1.wav"
    prediction = detect_triggerword(filename)
    chime_on_activate(filename, prediction, 0.5)
    IPython.display.Audio("./chime_output.wav")

    # Same for the second dev recording; chime_output.wav is overwritten.
    filename = "./raw_data/dev/2.wav"
    prediction = detect_triggerword(filename)
    chime_on_activate(filename, prediction, 0.5)
    IPython.display.Audio("./chime_output.wav")

    # Preprocess the audio to the correct format
    def preprocess_audio(filename):
        """
        Rewrite a wav file in place as a 10,000 ms clip with a 44100 frame rate.

        Arguments:
        filename -- path of the wav file; it is overwritten with the result
        """
        # Keep at most the first 10000 ms of the recording.
        clip = AudioSegment.from_wav(filename)[:10000]

        # Overlaying on 10000 ms of silence pads a shorter clip to full length.
        canvas = AudioSegment.silent(duration=10000)
        padded = canvas.overlay(clip)

        # Set frame rate to 44100 and export as wav over the input file.
        padded.set_frame_rate(44100).export(filename, format='wav')

    your_filename = "audio_examples/my_audio.wav"
    preprocess_audio(your_filename)
    IPython.display.Audio(your_filename) # listen to the audio you uploaded

    chime_threshold = 0.5
    prediction = detect_triggerword(your_filename)
    chime_on_activate(your_filename, prediction, chime_threshold)
    IPython.display.Audio("./chime_output.wav")

    最新回复(0)