In [5]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding


Using TensorFlow backend.


### TASK

A language model estimates $P(X_{t+1} \mid X_{1:t})$: given a sequence of words, it predicts the next word. By applying the model repeatedly (appending each predicted word to the input and predicting again), we can generate text.

In [6]:
# Small nursery-rhyme corpus. NOTE: `data` is reassigned by the next cell, so
# when the cells run in order this text is NOT what the tokenizer and model
# further down are fit on.
data = """ Jack and Jill went up the hill\n
         To fetch a pail of water\n
         Jack fell down and broke his crown\n
         And Jill came tumbling after\n """

In [7]:
# Training corpus: the opening lines of the "To be, or not to be" soliloquy.
# Each line ends with an explicit "\n" escape followed by a real line break, so
# data.split('\n') yields a blank (empty or whitespace-only) string between
# verses; the sequence-building loop produces no samples for lines that encode
# to fewer than two tokens.
# NOTE(review): the em dashes stay inside tokens (e.g. "pause—there's" appears
# as a single generated word in the sample output) — presumably because they
# are not in the Tokenizer's default filter set; confirm if word-level splits
# are wanted there.
data = """To be, or not to be, that is the question\n
Whether it is nobler in the mind to suffer\n
The slings and arrows of outrageous fortune\n
Or to take arms against a sea of troubles\n
And by opposing end them. To die—to sleep \n
No more; and by a sleep to say we end \n
The heart-ache and the thousand natural shocks \n
That flesh is heir to: 'tis a consummation \n
Devoutly to be wish'd. To die, to sleep \n
To sleep, perchance to dream—ay, there's the rub \n
For in that sleep of death what dreams may come \n 
When we have shuffled off this mortal coil\n
Must give us pause—there's the respect\n
That makes calamity of so long life\n"""


#### SEQUENCE GENERATOR

In [8]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    """Greedily extend ``seed_text`` with ``n_words`` words predicted by ``model``.

    At every step the text generated so far is encoded with ``tokenizer``,
    pre-padded to ``max_length`` ids, and fed to ``model``; the highest-scoring
    class is mapped back to a word and appended to the text.

    Args:
        model: object exposing ``predict(x, verbose=0)`` that returns one row of
            per-class scores for each input row.
        tokenizer: fitted tokenizer exposing ``word_index`` (word -> int id) and
            ``texts_to_sequences``.
        max_length: length the encoded input is padded to; must match the
            input length the model was built with.
        seed_text: initial text to continue.
        n_words: number of words to append; ``0`` returns ``seed_text`` as is.

    Returns:
        ``seed_text`` followed by the generated words, each preceded by a
        single space. If the predicted id has no entry in ``word_index``, an
        empty word is appended (so only the space is added).
    """
    # Build the id -> word table once instead of scanning word_index each step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # predict probabilities for each word; `predict_classes` no longer
        # exists in current Keras, so take the argmax of `predict` ourselves
        # and reduce it to a plain int for the dictionary lookup.
        probabilities = model.predict(encoded, verbose=0)
        yhat = int(probabilities[0].argmax())
        # map predicted word index to word
        out_word = index_to_word.get(yhat, '')
        # append to input
        in_text += ' ' + out_word
    return in_text

#### TOKENIZE

In [10]:
# fit a word-level tokenizer on the whole corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
# vocabulary size is one more than the number of distinct words
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size: %d' % vocab_size)
# for every line, collect each prefix that is at least two tokens long
# (the last token of a prefix later becomes the prediction target)
sequences = []
for line in data.split('\n'):
    encoded = tokenizer.texts_to_sequences([line])[0]
    sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))
print('Total Sequences: %d' % len(sequences))

Vocabulary Size: 76
Total Sequences: 102


#### PAD SENTENCES TO SAME LENGTH

In [11]:
# left-pad every prefix to the length of the longest one
max_length = max(map(len, sequences))
sequences = pad_sequences(
    sequences,
    maxlen=max_length,
    padding='pre',
)
print('Max Sequence Length: %d' % max_length)

Max Sequence Length: 10


#### SPLIT INPUTS

In [12]:
# each padded row = context ids (all but the last column) + target id (last column)
sequences = array(sequences)
X = sequences[:, :-1]
# one-hot encode the target ids over the whole vocabulary
y = to_categorical(sequences[:, -1], num_classes=vocab_size)


#### MODEL

In [13]:
# define model: embedding -> LSTM -> softmax over the vocabulary
model = Sequential()
# inputs are the max_length-1 context ids; each id is mapped to a 10-d vector
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(50))
# one output unit per vocabulary entry, normalised to a probability distribution
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 9, 10)             760       
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_1 (Dense)              (None, 76)                3876      
Total params: 16,836
Trainable params: 16,836
Non-trainable params: 0
_________________________________________________________________
None


#### TRAIN

In [14]:
# configure training: one-hot targets with a softmax output -> categorical cross-entropy
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# train on every (context, next word) pair; verbose=2 logs one line per epoch
model.fit(X, y, verbose=2, epochs=500)

Epoch 1/500
 - 2s - loss: 4.3306 - acc: 0.0490
Epoch 2/500
 - 0s - loss: 4.3260 - acc: 0.0980
Epoch 3/500
 - 0s - loss: 4.3221 - acc: 0.0980
Epoch 4/500
 - 0s - loss: 4.3179 - acc: 0.0980
Epoch 5/500
 - 0s - loss: 4.3118 - acc: 0.0980
Epoch 6/500
 - 0s - loss: 4.3054 - acc: 0.0980
Epoch 7/500
 - 0s - loss: 4.2963 - acc: 0.0980
Epoch 8/500
 - 0s - loss: 4.2829 - acc: 0.0980
Epoch 9/500
 - 0s - loss: 4.2636 - acc: 0.0980
Epoch 10/500
 - 0s - loss: 4.2337 - acc: 0.0980
Epoch 11/500
 - 0s - loss: 4.1839 - acc: 0.0980
Epoch 12/500
 - 0s - loss: 4.1220 - acc: 0.0980
Epoch 13/500
 - 0s - loss: 4.0688 - acc: 0.0980
Epoch 14/500
 - 0s - loss: 4.0722 - acc: 0.0980
Epoch 15/500
 - 0s - loss: 4.0543 - acc: 0.0980
Epoch 16/500
 - 0s - loss: 4.0358 - acc: 0.0980
Epoch 17/500
 - 0s - loss: 4.0245 - acc: 0.0980
Epoch 18/500
 - 0s - loss: 4.0158 - acc: 0.0980
Epoch 19/500
 - 0s - loss: 4.0098 - acc: 0.0980
Epoch 20/500
 - 0s - loss: 4.0011 - acc: 0.0980
Epoch 21/500
 - 0s - loss: 3.9928 - acc: 0.0980
E

Epoch 171/500
 - 0s - loss: 1.8934 - acc: 0.5686
Epoch 172/500
 - 0s - loss: 1.8872 - acc: 0.5294
Epoch 173/500
 - 0s - loss: 1.8810 - acc: 0.5294
Epoch 174/500
 - 0s - loss: 1.8698 - acc: 0.5490
Epoch 175/500
 - 0s - loss: 1.8651 - acc: 0.5490
Epoch 176/500
 - 0s - loss: 1.8441 - acc: 0.5588
Epoch 177/500
 - 0s - loss: 1.8403 - acc: 0.5686
Epoch 178/500
 - 0s - loss: 1.8225 - acc: 0.5784
Epoch 179/500
 - 0s - loss: 1.8286 - acc: 0.5784
Epoch 180/500
 - 0s - loss: 1.8120 - acc: 0.5588
Epoch 181/500
 - 0s - loss: 1.7986 - acc: 0.5980
Epoch 182/500
 - 0s - loss: 1.7991 - acc: 0.6078
Epoch 183/500
 - 0s - loss: 1.7824 - acc: 0.5784
Epoch 184/500
 - 0s - loss: 1.7722 - acc: 0.5980
Epoch 185/500
 - 0s - loss: 1.7645 - acc: 0.5980
Epoch 186/500
 - 0s - loss: 1.7534 - acc: 0.5980
Epoch 187/500
 - 0s - loss: 1.7460 - acc: 0.5784
Epoch 188/500
 - 0s - loss: 1.7417 - acc: 0.5686
Epoch 189/500
 - 0s - loss: 1.7312 - acc: 0.5686
Epoch 190/500
 - 0s - loss: 1.7226 - acc: 0.5784
Epoch 191/500
 - 0s 

 - 0s - loss: 0.8778 - acc: 0.8333
Epoch 339/500
 - 0s - loss: 0.8598 - acc: 0.8333
Epoch 340/500
 - 0s - loss: 0.8526 - acc: 0.8431
Epoch 341/500
 - 0s - loss: 0.8460 - acc: 0.8529
Epoch 342/500
 - 0s - loss: 0.8441 - acc: 0.8431
Epoch 343/500
 - 0s - loss: 0.8401 - acc: 0.8431
Epoch 344/500
 - 0s - loss: 0.8377 - acc: 0.8235
Epoch 345/500
 - 0s - loss: 0.8282 - acc: 0.8529
Epoch 346/500
 - 0s - loss: 0.8198 - acc: 0.8431
Epoch 347/500
 - 0s - loss: 0.8224 - acc: 0.8431
Epoch 348/500
 - 0s - loss: 0.8157 - acc: 0.8431
Epoch 349/500
 - 0s - loss: 0.8146 - acc: 0.8529
Epoch 350/500
 - 0s - loss: 0.8068 - acc: 0.8627
Epoch 351/500
 - 0s - loss: 0.8286 - acc: 0.8333
Epoch 352/500
 - 0s - loss: 0.8111 - acc: 0.8529
Epoch 353/500
 - 0s - loss: 0.8028 - acc: 0.8431
Epoch 354/500
 - 0s - loss: 0.8032 - acc: 0.8627
Epoch 355/500
 - 0s - loss: 0.7909 - acc: 0.8627
Epoch 356/500
 - 0s - loss: 0.7868 - acc: 0.8529
Epoch 357/500
 - 0s - loss: 0.7831 - acc: 0.8529
Epoch 358/500
 - 0s - loss: 0.7765

<keras.callbacks.History at 0x1f46632f128>

#### EVALUATE

In [15]:
# qualitative check: continue a few seed phrases taken from the corpus by 4 words
for seed_text in ('The slings and', 'Or to', 'Must give'):
    print(generate_seq(model, tokenizer, max_length-1, seed_text, 4))

The slings and arrows of outrageous fortune
Or to take arms against a
Must give us pause—there's the respect
