### Text preprocessing
# Tokenization: breaking a sentence into individual words (tokens)
from tensorflow.keras.preprocessing.text import Tokenizer
en_tok = Tokenizer(num_words=50, oov_token='UNK') # limit the impact of rare words by keeping only the 50 most frequent; unseen words are replaced with the 'UNK' token
en_tok.fit_on_texts(en_text)
word_id = en_tok.word_index["january"] # returns the index assigned to the word by the tokenizer (e.g. 51)
w = en_tok.index_word[51] # returns the word mapped to the given index (e.g. 'january')
sentences = ["I love to play football", "He loves to play cricket"]
seqs = en_tok.texts_to_sequences(sentences) # Transforming word sequence into tokens sequence (eg: [[26, 70, 27, 73, 7], ...])
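# Quick check of the oov_token behaviour (illustrative; the exact indices depend on the fitted corpus):
# with an oov_token set, Keras typically assigns it index 1, so words never seen during fitting map to that index
en_tok.texts_to_sequences(["he loves quidditch"]) # 'quidditch' is replaced by the ID of en_tok.word_index['UNK']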
# Padding: pad sequences with zeros at the beginning ('pre') or end ('post') to a fixed length, truncating longer sequences from the beginning or the end
from tensorflow.keras.preprocessing.sequence import pad_sequences
preproc_text = pad_sequences(seqs, padding='post', truncating='post', maxlen=9) # [18, 20, 2, 10] => [18 20 2 10 0 0 0 0 0]
preproc_text = preproc_text[:, ::-1] # Reverse the source sequences for a stronger connection between encoder and decoder (the start of the source ends up closest to the decoder)
shuffled_df = df.sample(frac=1, random_state=42).reset_index(drop=True) # Shuffle dataframe
# Make sure to add "sos" at the beginning and "eos" at the end of each sentence (see the sketch below)
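# A minimal sketch of adding the markers (fr_text is the list of target sentences used elsewhere in these notes):
fr_text = ['sos ' + sent + ' eos' for sent in fr_text]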
# Splitting data into training and validation sets
train_size, valid_size = 800, 200
train_data = df.values[:train_size] # Do this on both en_text and fr_text
valid_data = df.values[train_size:train_size+valid_size] # Do this on both en_text and fr_text
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
en_len = 15 # maximum number of words per sentence (sequence length)
en_vocab = 150 # total number of unique words across all the text (vocabulary size)
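# Sketch: the vocabulary size can also be read off the fitted tokenizer instead of being hard-coded
# (the +1 reserves index 0 for padding); kept as a separate variable here so the value above is untouched
derived_vocab = len(en_tok.word_index) + 1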
## Helper function that converts sentences into padded, optionally reversed, optionally one-hot encoded sequences of word IDs
def sents2seqs(input_type, sentences, onehot=False, pad_type='post', reverse=False):
    encoded_text = en_tok.texts_to_sequences(sentences) # Convert sentences to sequences of word IDs
    preproc_text = pad_sequences(encoded_text, padding=pad_type, truncating='post', maxlen=en_len) # Pad/truncate to a consistent length
    if reverse: # Reverse the sequences along the time axis
        preproc_text = preproc_text[:, ::-1]
    if onehot: # Convert the word IDs to one-hot vectors
        preproc_text = to_categorical(preproc_text, num_classes=en_vocab)
    return preproc_text
# Call sents2seqs to get the padded and reversed sequence of IDs
sentences = ["It is never rainy during july ."]
pad_seq = sents2seqs('source', sentences, reverse=True)
rev_sent = [en_tok.index_word[wid] for wid in pad_seq[0][-6:]] # after reversal the padding sits at the front, so the last 6 IDs are the actual words
print('Reversed: ',' '.join(rev_sent)) # Get full sentence by joining with ' '
# Build an encoder-decoder model (NOTE: see ENCODER DECODER IN KERAS)
from tensorflow.keras.layers import Input, GRU, RepeatVector, TimeDistributed, Dense
from tensorflow.keras.models import Model
input_shape = (en_len, en_vocab)
encoder_input = Input(shape=input_shape)
encoder_output, encoder_state = GRU(48, name='gru', return_state=True)(encoder_input) # return_state=True also returns the final hidden state
decoder_input = RepeatVector(en_len)(encoder_state) # repeat the encoder state once per decoder time step (use the target sequence length)
decoder_gru = GRU(48, return_sequences=True)(decoder_input, initial_state=encoder_state)
decoder_output = TimeDistributed(Dense(en_vocab, activation='softmax'))(decoder_gru) # per-step distribution over the vocabulary (for translation this should be the target vocabulary size)
model = Model(inputs=encoder_input, outputs=decoder_output)
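# The model has to be compiled before train_on_batch / evaluate can be used; a minimal setup
# (optimizer and loss below are assumptions, chosen to match the one-hot softmax outputs and the accuracy metric used later):
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])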
# Train in batches
n_epochs, b_size = 5, 250 # Epoch and batch size
data_size = train_size # number of training sentence pairs
for ei in range(n_epochs):
    for i in range(0, data_size, b_size): # NOTE: train_df_en / train_df_fr are dataframe columns converted into lists of sentences
        en_x = sents2seqs('source', train_df_en[i:i+b_size], onehot=True, pad_type='pre') # Source language
        de_y = sents2seqs('target', train_df_fr[i:i+b_size], onehot=True) # Translated (target) language
        model.train_on_batch(en_x, de_y)
v_en_x = sents2seqs('source', validation_df_en, onehot=True, pad_type='pre')
v_de_y = sents2seqs('target', validation_df_fr, onehot=True)
res = model.evaluate(v_en_x, v_de_y, batch_size=valid_size, verbose=0)
res[0] # Loss
res[1]*100.0 # Accuracy
# Sanity checks for debugging
en_st = ['it is sometimes chilly during december and freezing in june .']
en_seq = sents2seqs('source', en_st, onehot=True, reverse=True) # Transform the encoder sentence
import numpy as np
np.argmax(en_seq, axis=-1) # recover the word IDs from the one-hot input (sanity check against the original sentence)
fr_pred = model.predict(en_seq)
fr_pred.shape # [num sentences, sequence len, vocab size]
fr_seq = np.argmax(fr_pred, axis=-1)[0] # e.g. [ 3 7 35 34 2 ... 5 4 4 0 0] # take the index of the highest-probability word along the vocabulary dimension
fr_seq.shape # [sequence len] (the [0] above selects the first sentence)
word_list = [fr_tok.index_word[i] for i in fr_seq if i != 0] # Skip the padding ID 0 and map the remaining IDs back to words
fr_sentence = ' '.join(word_list) # Join words to get complete sentence
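# A sketch wrapping the steps above into a reusable helper (model, sents2seqs and fr_tok are assumed from the surrounding notes):
def translate(sentence):
    en_seq = sents2seqs('source', [sentence], onehot=True, reverse=True) # preprocess the source sentence
    fr_pred = model.predict(en_seq) # per-step probability distribution over the target vocabulary
    fr_seq = np.argmax(fr_pred, axis=-1)[0] # most likely word ID at each time step
    return ' '.join(fr_tok.index_word[i] for i in fr_seq if i != 0) # drop padding and join into a sentence
print(translate('it is sometimes chilly during december and freezing in june .'))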
# NOTE : Also see TEACHER FORCING IN KERAS