### Text preprocessing
# Tokenization: breaking a sentence into individual words (tokens)
from tensorflow.keras.preprocessing.text import Tokenizer
en_tok = Tokenizer(num_words=50, oov_token='UNK') # limit the impact of rare words by keeping only the 50 most frequent; unseen words are replaced with the 'UNK' token
en_tok.fit_on_texts(en_text)
word_id = en_tok.word_index["january"] # returns the index assigned to the word by the tokenizer (e.g. 51)
w = en_tok.index_word[51] # returns the word mapped to the given index (e.g. 'january')
sentences = ["I love to play football", "He loves to play cricket"]
seqs = en_tok.texts_to_sequences(sentences) # Transforming word sequence into tokens sequence (eg: [[26, 70, 27, 73, 7], ...])
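# Quick check of the oov_token behaviour (illustrative; the exact indices depend on the fitted corpus):
# with an oov_token set, Keras typically assigns it index 1, so words never seen during fitting map to that index
en_tok.texts_to_sequences(["he loves quidditch"]) # 'quidditch' is replaced by the ID of en_tok.word_index['UNK']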
# Padding: pad sequences with zeros at the beginning ('pre') or end ('post') to a fixed length, truncating longer sequences from the beginning or the end
from tensorflow.keras.preprocessing.sequence import pad_sequences
preproc_text = pad_sequences(seqs, padding='post', truncating='post', maxlen=9) # [18, 20, 2, 10] => [18 20 2 10 0 0 0 0 0]
preproc_text = preproc_text[:, ::-1] # Reverse the source sequences for a stronger connection between encoder and decoder (the start of the source ends up closest to the decoder)
shuffled_df = df.sample(frac=1, random_state=42).reset_index(drop=True) # Shuffle dataframe
# Make sure to add "sos" at the beginning and "eos" at the end of each sentence (see the sketch below)
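# A minimal sketch of adding the markers (fr_text is the list of target sentences used elsewhere in these notes):
fr_text = ['sos ' + sent + ' eos' for sent in fr_text]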
# Splitting data into training and validation sets
train_size, valid_size = 800, 200
train_data = df.values[:train_size] # Do this on both en_text and fr_text
valid_data = df.values[train_size:train_size+valid_size] # Do this on both en_text and fr_text
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
en_len = 15 # maximum number of words per sentence (sequence length)
en_vocab = 150 # total number of unique words across all the text (vocabulary size)
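# Sketch: the vocabulary size can also be read off the fitted tokenizer instead of being hard-coded
# (the +1 reserves index 0 for padding); kept as a separate variable here so the value above is untouched
derived_vocab = len(en_tok.word_index) + 1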
## Helper function that converts sentences into padded, optionally reversed, optionally one-hot encoded sequences of word IDs
def sents2seqs(input_type, sentences, onehot=False, pad_type='post', reverse=False):
    encoded_text = en_tok.texts_to_sequences(sentences) # Convert sentences to sequences of word IDs
    preproc_text = pad_sequences(encoded_text, padding=pad_type, truncating='post', maxlen=en_len) # Pad/truncate to a consistent length
    if reverse: # Reverse the sequences along the time axis
        preproc_text = preproc_text[:, ::-1]
    if onehot: # Convert the word IDs to one-hot vectors
        preproc_text = to_categorical(preproc_text, num_classes=en_vocab)
    return preproc_text
# Call sents2seqs to get the padded and reversed sequence of IDs
sentences = ["It is never rainy during july ."]
pad_seq = sents2seqs('source', sentences, reverse=True)
rev_sent = [en_tok.index_word[wid] for wid in pad_seq[0][-6:]] # after reversal the padding sits at the front, so the last 6 IDs are the actual words
print('Reversed: ',' '.join(rev_sent)) # Get full sentence by joining with ' '
# Build an encoder-decoder model (NOTE: see ENCODER DECODER IN KERAS)
from tensorflow.keras.layers import Input, GRU, RepeatVector, TimeDistributed, Dense
from tensorflow.keras.models import Model
input_shape = (en_len, en_vocab)
encoder_input = Input(shape=input_shape)
encoder_output, encoder_state = GRU(48, name='gru', return_state=True)(encoder_input) # return_state=True also returns the final hidden state
decoder_input = RepeatVector(en_len)(encoder_state) # repeat the encoder state once per decoder time step (use the target sequence length)
decoder_gru = GRU(48, return_sequences=True)(decoder_input, initial_state=encoder_state)
decoder_output = TimeDistributed(Dense(en_vocab, activation='softmax'))(decoder_gru) # per-step distribution over the vocabulary (for translation this should be the target vocabulary size)
model = Model(inputs=encoder_input, outputs=decoder_output)
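# The model has to be compiled before train_on_batch / evaluate can be used; a minimal setup
# (optimizer and loss below are assumptions, chosen to match the one-hot softmax outputs and the accuracy metric used later):
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])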
# Train in batches
n_epochs, b_size = 5, 250 # Epoch and batch size
data_size = train_size # number of training sentence pairs
for ei in range(n_epochs):
    for i in range(0, data_size, b_size): # NOTE: train_df_en / train_df_fr are dataframe columns converted into lists of sentences
        en_x = sents2seqs('source', train_df_en[i:i+b_size], onehot=True, pad_type='pre') # Source language
        de_y = sents2seqs('target', train_df_fr[i:i+b_size], onehot=True) # Translated (target) language
        model.train_on_batch(en_x, de_y)
v_en_x = sents2seqs('source', validation_df_en, onehot=True, pad_type='pre')
v_de_y = sents2seqs('target', validation_df_fr, onehot=True)
res = model.evaluate(v_en_x, v_de_y, batch_size=valid_size, verbose=0)
res[0] # Loss
res[1]*100.0 # Accuracy
# Sanity checks for debugging
en_st = ['it is sometimes chilly during december and freezing in june .']
en_seq = sents2seqs('source', en_st, onehot=True, reverse=True) # Transform the encoder sentence
import numpy as np
np.argmax(en_seq, axis=-1) # recover the word IDs from the one-hot input (sanity check against the original sentence)
fr_pred = model.predict(en_seq)
fr_pred.shape # [num sentences, sequence len, vocab size]
fr_seq = np.argmax(fr_pred, axis=-1)[0] # e.g. [ 3 7 35 34 2 ... 5 4 4 0 0] # take the index of the highest-probability word along the vocabulary dimension
fr_seq.shape # [sequence len] (the [0] above selects the first sentence)
word_list = [fr_tok.index_word[i] for i in fr_seq if i != 0] # Skip the padding ID 0 and map the remaining IDs back to words
fr_sentence = ' '.join(word_list) # Join words to get complete sentence
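# A sketch wrapping the steps above into a reusable helper (model, sents2seqs and fr_tok are assumed from the surrounding notes):
def translate(sentence):
    en_seq = sents2seqs('source', [sentence], onehot=True, reverse=True) # preprocess the source sentence
    fr_pred = model.predict(en_seq) # per-step probability distribution over the target vocabulary
    fr_seq = np.argmax(fr_pred, axis=-1)[0] # most likely word ID at each time step
    return ' '.join(fr_tok.index_word[i] for i in fr_seq if i != 0) # drop padding and join into a sentence
print(translate('it is sometimes chilly during december and freezing in june .'))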
# NOTE : Also see TEACHER FORCING IN KERAS