Hello, I need help with the TFC (term frequency × inverse document frequency, cosine-normalized) term-weighting method. I wrote my function and it works, but only for small amounts of data. When I try to feed it the full training sample, it just hangs. Here is the neural network:
"""Sentiment-classification pipeline: load labelled reviews, build a
frequency-ranked vocabulary, convert reviews to padded id sequences, and
train a small 1-D CNN with Keras."""

import itertools
import re
from collections import Counter
from string import punctuation

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup

import keras
from keras import optimizers
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import (Dense, Activation, Dropout, Conv1D, Flatten,
                          MaxPooling1D, GlobalMaxPooling1D, Embedding)
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

# nltk.download("stopwords")  # run once if the stopword corpus is missing

# --- Load the labelled data (Text + Sentiment columns only) --------------
pos_train_data = pd.read_csv('train_pos.tsv', sep='\t')[['Text', 'Sentiment']]
neg_train_data = pd.read_csv('train_neg.tsv', sep='\t')[['Text', 'Sentiment']]
pos_test_data = pd.read_csv('test_pos.tsv', sep='\t')[['Text', 'Sentiment']]
neg_test_data = pd.read_csv('test_neg.tsv', sep='\t')[['Text', 'Sentiment']]

# Merge positive/negative halves and shuffle row order.
data_train = pd.concat([pos_train_data, neg_train_data], ignore_index=True)
data_train = data_train.sample(frac=1).reset_index(drop=True)

data_test = pd.concat([pos_test_data, neg_test_data], ignore_index=True)
data_test = data_test.sample(frac=1).reset_index(drop=True)

# Hoisted once: set gives O(1) membership tests; table strips punctuation.
stop_words = set(stopwords.words('english'))
table = str.maketrans('', '', punctuation)


def textclean(text):
    """Lower-case *text* and return its tokens, keeping only alphabetic
    words longer than one character that are not English stopwords."""
    tokens = text.lower().split()
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if word not in stop_words]
    return [word for word in tokens if len(word) > 1]


def review_to_words(text):
    """Strip HTML and non-letters from *text*; return lower-cased tokens
    with stopwords removed."""
    clean_text = BeautifulSoup(text, "html5lib").get_text()
    clean_text = re.sub(r"[^a-zA-Z]", " ", clean_text)
    words = clean_text.lower().split()
    # FIX: reuse the precomputed `stop_words` set instead of rebuilding
    # stopwords.words("english") (a fresh list scan) for every review.
    return [w for w in words if w not in stop_words]


# Tokenize every training review.
reviews = [textclean(row['Text'].lower()) for _, row in data_train.iterrows()]

# Frequency-ranked vocabulary of the TOTAL_VOCAB most common words.
TOTAL_VOCAB = 5000
vocab_freq = Counter(itertools.chain.from_iterable(reviews))
word_to_id = {}
id_to_word = {}
for idx, (word, _count) in enumerate(vocab_freq.most_common(TOTAL_VOCAB)):
    word_to_id[word] = idx
    id_to_word[idx] = word
print(id_to_word[1])

review_lengths = pd.DataFrame({'Len': [len(review) for review in reviews]})
# print(review_lengths.describe())


def convert(tokens):
    """Map *tokens* to vocabulary ids, dropping out-of-vocabulary words."""
    return [word_to_id[word] for word in tokens if word in word_to_id]


# Tukey's method: length outlier thresholds from the interquartile range.
first_q = review_lengths.Len.quantile(0.25)
third_q = review_lengths.Len.quantile(0.75)
iqr = third_q - first_q
upper_threshold = third_q + 1.5 * iqr
lower_threshold = first_q - 1.5 * iqr

# FIX: one sequence length used everywhere. The original filtered training
# reviews at a hard-coded 250, padded to int(upper_threshold), and built the
# Embedding with input_length=205 — three values that must all agree.
MAXLEN = int(upper_threshold)

X_train, y_train = [], []
for i in range(len(data_train)):
    converted_review = convert(reviews[i])
    if len(converted_review) <= MAXLEN:
        X_train.append(converted_review)
        y_train.append(data_train['Sentiment'][i])
y_train = np.array(y_train)
X_train = sequence.pad_sequences(X_train, maxlen=MAXLEN, value=0)

# FIX: the original concatenated pos_test_data TWICE here, skewing the
# validation set toward positive reviews; use one copy of each class.
data_test = pd.concat([pos_test_data, neg_test_data], ignore_index=True)
data_test = data_test.sample(frac=0.3).reset_index(drop=True)

validation_reviews = [textclean(row['Text'].lower())
                      for _, row in data_test.iterrows()]

X_val, y_val = [], []
for i in range(len(data_test)):
    converted_review = convert(validation_reviews[i])
    if len(converted_review) <= MAXLEN:
        X_val.append(converted_review)
        y_val.append(data_test['Sentiment'][i])
X_val = sequence.pad_sequences(X_val, maxlen=MAXLEN, value=0)
y_val = np.array(y_val)

# --- Model: embedding → stacked Conv1D → dense sigmoid classifier --------
EMBEDDING_LEN = 32

model = Sequential()
model.add(Embedding(TOTAL_VOCAB, EMBEDDING_LEN, input_length=MAXLEN))
model.add(Conv1D(256, 3, padding='same'))
model.add(Dropout(0.25))
model.add(Conv1D(128, 3, padding='same'))
model.add(Dropout(0.25))
model.add(Conv1D(32, 2, padding='same'))
model.add(Dropout(0.25))
# FIX: `pool_length` is the deprecated alias; the kwarg is `pool_size`.
model.add(MaxPooling1D(pool_size=4))
model.add(Conv1D(16, 2, padding='same'))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dropout(0.25))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()

# FIX: lr=0.2 is ~200x the Adam default and will not converge; use 1e-3.
opt = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999,
                      epsilon=1e-08, decay=0.0)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_val, y_val),
          epochs=1, batch_size=32)

# And this is the TFC weighting function:
def tfc(slova):
    """TFC term weighting (tf·idf with cosine normalization).

    Parameters
    ----------
    slova : list[list[str]]
        Corpus as a list of tokenized documents.

    Returns
    -------
    list[dict[str, float]]
        One dict per document mapping each distinct term to its
        normalized weight: tf * log10(N / df), divided by the square
        root of the sum of squared weights over the whole corpus
        (the original function's global normalizer).

    Why this rewrite: the original rescanned every document with
    ``q.count(...)`` for every token *occurrence* — O(D^2 * T) overall —
    which is why it hung on the full training sample. It also tested
    ``slova[i][j] in meshokslov1`` against the *list of dicts* (always
    False), so repeated terms were recomputed and their squares were
    added to the normalizer once per occurrence instead of once per
    unique term. Here term and document frequencies are each computed
    once, giving O(total tokens).
    """
    import math
    from collections import Counter

    n_docs = len(slova)

    # Term frequency per document, computed in one pass each.
    tf_per_doc = [Counter(doc) for doc in slova]

    # Document frequency: in how many documents each term appears.
    df = Counter()
    for counts in tf_per_doc:
        df.update(counts.keys())

    # Unnormalized tf-idf weights and the global sum of squares.
    weights = []
    sum_of_squares = 0.0
    for counts in tf_per_doc:
        doc_weights = {}
        for term, freq in counts.items():
            w = freq * math.log10(n_docs / df[term])
            doc_weights[term] = w
            sum_of_squares += w * w
        weights.append(doc_weights)

    norm = math.sqrt(sum_of_squares)
    if norm == 0.0:
        # Every weight is zero (e.g. each term occurs in every document);
        # the original divided by zero here. Return the zero weights as-is.
        return weights
    return [{term: w / norm for term, w in doc.items()} for doc in weights]