I just can't understand why the key is missing from the index. Or maybe the index isn't the problem at all?

import numpy as np
from google.colab import files
uploaded = files.upload()  # read the file

START_CHAR = '\b'    # will be prepended before the start of a sentence
END_CHAR = '\t'      # and appended after its end
PADDING_CHAR = '\a'  # will pad the rest of a sentence up to the maximum length

chars = set([START_CHAR, '\n', END_CHAR])
#with open(uploaded, 'r') as f:
for line in str(uploaded.values()):
    chars.update(list(line.strip().lower()))
#print('chars', chars)
char_indices = {c: i for i, c in enumerate(sorted(list(chars)))}
char_indices[PADDING_CHAR] = 0
indices_to_chars = {i: c for c, i in char_indices.items()}
num_chars = len(chars)

# Next, create vector representations for the characters; this is a simple
# one-hot encoding in which each character is a vector with a single one
def get_one(i, sz):
    res = np.zeros(sz)
    res[i] = 1
    return res

char_vectors = {
    c: (np.zeros(num_chars) if c == PADDING_CHAR else get_one(v, num_chars))
    for c, v in char_indices.items()
}

# Now read the input file once more, this time splitting it into sentences
# and writing them out separately.
sentence_end_markers = set('.!?')
sentences = []
current_sentence = ''
for line in str(uploaded.values()):
    s = line.lower()
    if len(s) > 0:
        current_sentence += s  # + 1
    if s in sentence_end_markers:
        current_sentence = current_sentence.strip('.!?')
        if len(current_sentence) > 15:
            current_sentence += s + "\n"
            sentences.append(str(current_sentence))
        current_sentence = ''
print('current_sentence.strip', sentences)

# Next step: vectorization. Define a procedure that turns a set of sentences
# into two tensors: X holds the character vectors, y holds the result
# we need to predict
def get_matrices(sentences):
    max_sentence_len = np.max([len(x) for x in sentences])
    X = np.zeros((len(sentences), max_sentence_len, len(chars)), dtype=np.bool)  # bool for speed and memory savings
    y = np.zeros((len(sentences), max_sentence_len, len(chars)), dtype=np.bool)  # bool for speed and memory savings
    for i, sentence in enumerate(sentences):
        char_seq = (START_CHAR + sentence + END_CHAR).ljust(max_sentence_len + 1, PADDING_CHAR)
        for t in range(max_sentence_len):
            X[i, t, :] = char_vectors[char_seq[t]]
            y[i, t, :] = char_vectors[char_seq[t + 1]]
    return X, y

# one layer of LSTM cells whose outputs are passed through a single
# fully connected layer, followed directly by classification.
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, TimeDistributed, Activation

model = Sequential()
model.add(LSTM(output_dim=128, activation='tanh', return_sequences=True, input_dim=num_chars))
model.add(Dropout(0.2))
model.add(TimeDistributed(Dense(output_dim=num_chars)))
model.add(Activation('softmax'))

from keras.optimizers import Adam
model.compile(loss='categorical_crossentropy', optimizer=Adam(clipnorm=1.), metrics=['accuracy'])

# a generator function that yields minibatches one after another
test_indices = np.random.choice(range(len(sentences)), int(len(sentences) * 0.05))
sentences_train = [sentences[x] for x in set(range(len(sentences))) - set(test_indices)]
sentences_test = [sentences[x] for x in test_indices]
sentences_train = sorted(sentences_train, key=lambda x: len(x))
X_test, y_test = get_matrices(sentences_test)
batch_size = 16

def generate_batch():
    while True:
        for i in range(int(len(sentences_train) / batch_size)):
            sentences_batch = sentences_train[i * batch_size : (i + 1) * batch_size]
            yield get_matrices(sentences_batch)

# two standard callbacks
from keras.callbacks import ModelCheckpoint, CSVLogger
cb_sampler = CharSampler(char_vectors, model)
cb_logger = CSVLogger('sin_l/' + model_fname + '.log')

# generating a few texts as one more callback.
from keras.callbacks import Callback

class CharSampler(Callback):
    def __init__(self, char_vectors, model):
        self.char_vectors = char_vectors
        self.model = model

    def on_train_begin(self, logs={}):
        self.epoch = 0
        if os.path.isfile(output_fname):
            os.remove(output_fname)

    def sample(self, preds, temperature=1.0):
        preds = np.asarray(preds).astype('float64')
        preds = np.log(preds) / temperature
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)
        probas = np.random.multinomial(1, preds, 1)
        return np.argmax(probas)

    def sample_one(self, T):
        result = START_CHAR
        while len(result) < 500:
            Xsampled = np.zeros((1, len(result), num_chars))
            for t, c in enumerate(list(result)):
                Xsampled[0, t, :] = self.char_vectors[c]
            ysampled = self.model.predict(Xsampled, batch_size=1)[0, :]
            yv = ysampled[len(result) - 1, :]
            selected_char = indices_to_chars[self.sample(yv, T)]
            if selected_char == END_CHAR:
                break
            result = result + selected_char
        return result

    def on_epoch_end(self, batch, logs={}):
        self.epoch = self.epoch + 1
        if self.epoch % 50 == 0:
            print("\nEpoch %d text sampling:" % self.epoch)
            with open(output_fname, 'a') as outf:
                outf.write('\n===== Epoch %d =====\n' % self.epoch)
                for T in [0.3, 0.5, 0.7, 0.9, 1.1]:
                    print('\tsampling, T = %.1f...' % T)
                    for _ in range(5):
                        self.model.reset_states()
                        res = self.sample_one(T)
                        outf.write('\nT = %.1f\n%s\n' % (T, res[1:]))

model.fit_generator(generate_batch(),
                    int(len(sentences_train) / batch_size) * batch_size,
                    nb_epoch=1000, verbose=True,
                    validation_data=(X_test, y_test),
                    callbacks=[cb_logger, cb_sampler, cb_checkpoint])

Running the notebook gives:

    lt1.txt(text/plain) - 442059 bytes, last modified: 27.05.2018 - 100% done
    Saving lt1.txt to lt1 (2).txt
    ---------------------------------------------------------------------------
    KeyError                                  Traceback (most recent call last)
    <ipython-input-2-4b86f59567ae> in <module>()
        111 sentences_test = [ sentences[x] for x in test_indices]
        112 sentences_train = sorted(sentences_train, key = lambda x : len(x))
    --> 113 X_test, y_test = get_matrices(sentences_test)
        114 batch_size = 16
        115 def generate_batch():

    <ipython-input-2-4b86f59567ae> in get_matrices(sentences)
         74         for t in range(max_sentence_len):
         75             X[i, t, :] = char_vectors[char_seq[t]]
    ---> 76             y[i, t, :] = char_vectors[char_seq[t+1]]
         77     return X,y
         78

    KeyError: ' '
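The same lookup failure reproduces in isolation. Here is a minimal sketch with a toy alphabet (the names and data below are illustrative, not from the notebook above): if a sentence contains a character that never made it into char_vectors, the identical KeyError appears.

import numpy as np

# toy alphabet that, like whatever happens with my real data,
# does not contain the space character
chars = set('abc')
char_indices = {c: i for i, c in enumerate(sorted(chars))}
char_vectors = {c: np.eye(len(chars))[i] for c, i in char_indices.items()}

sentence = 'a b'  # contains a space
try:
    char_vectors[sentence[1]]  # the same lookup get_matrices performs
except KeyError as e:
    print('KeyError:', repr(e))  # prints: KeyError: ' '

So the question seems to come down to why the space character is missing from chars and char_vectors in the first place, even though it clearly occurs in the sentences.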
  • Can anyone help? - Filip Sena
  • Well, the easiest way is to add some debug printing there to catch the error: see what t equals at that moment, what char_seq[t + 1] equals, and what is actually in char_seq and in char_vectors. As I understand it, char_vectors has no key ' ' (a space), and what remains is to figure out why it's not there and what really should be there (see the sketch after these comments). - CrazyElf
  • I have the same problem; did you ever find the answer? - Vladusha Gusak
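To illustrate the debug printing CrazyElf suggests, here is a sketch of a drop-in variant of get_matrices from the question. It relies on the same globals (chars, START_CHAR, END_CHAR, PADDING_CHAR, char_vectors); the try/except and the prints are illustrative additions, not part of the original code.

import numpy as np

def get_matrices(sentences):
    max_sentence_len = np.max([len(x) for x in sentences])
    # plain bool avoids the deprecated np.bool alias
    X = np.zeros((len(sentences), max_sentence_len, len(chars)), dtype=bool)
    y = np.zeros((len(sentences), max_sentence_len, len(chars)), dtype=bool)
    for i, sentence in enumerate(sentences):
        char_seq = (START_CHAR + sentence + END_CHAR).ljust(max_sentence_len + 1, PADDING_CHAR)
        for t in range(max_sentence_len):
            try:
                X[i, t, :] = char_vectors[char_seq[t]]
                y[i, t, :] = char_vectors[char_seq[t + 1]]
            except KeyError as e:
                # show which character is missing and what keys actually exist
                print('sentence #%d, t = %d, missing key: %r' % (i, t, e.args[0]))
                print('char_seq around t: %r' % char_seq[max(0, t - 5):t + 6])
                print('known keys:', sorted(map(repr, char_vectors)))
                raise
    return X, y

Running the question's code with this variant should print the offending character (here, a space) together with its context and confirm that ' ' is absent from char_vectors even though it occurs in the sentences.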
