Hello, I need help with the TFC method of term weighting. I wrote my function and it works, but only for small amounts of data: when you try to feed it the training sample, it just hangs. Here is the neural network:

    import numpy as np
    import pandas as pd
    import nltk
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from string import punctuation
    from bs4 import BeautifulSoup
    import re
    import itertools
    from itertools import zip_longest
    import operator
    import keras
    from keras.preprocessing import sequence
    from keras.models import Sequential
    from keras.layers import Dense, Activation, Dropout, Conv1D, Flatten, MaxPooling1D, GlobalMaxPooling1D
    from keras.layers import Embedding
    from keras import optimizers
    from keras.datasets import imdb
    from keras.preprocessing.text import Tokenizer

    #nltk.download("stopwords")

    pos_train_data = pd.read_csv('train_pos.tsv', sep='\t')
    neg_train_data = pd.read_csv('train_neg.tsv', sep='\t')
    pos_test_data = pd.read_csv('test_pos.tsv', sep='\t')
    neg_test_data = pd.read_csv('test_neg.tsv', sep='\t')

    pos_train_data = pos_train_data[['Text', 'Sentiment']]
    neg_train_data = neg_train_data[['Text', 'Sentiment']]
    pos_test_data = pos_test_data[['Text', 'Sentiment']]
    neg_test_data = neg_test_data[['Text', 'Sentiment']]

    data_train = pd.concat([pos_train_data, neg_train_data], ignore_index=True)
    data_train = data_train.sample(frac=1).reset_index(drop=True)
    #print(data_train.head())

    data_test = pd.concat([pos_test_data, neg_test_data], ignore_index=True)
    data_test = data_test.sample(frac=1).reset_index(drop=True)
    #print(data_test.head())

    stop_words = set(stopwords.words('english'))
    table = str.maketrans('', '', punctuation)

    def textclean(text):
        #tokens = word_tokenize(text)
        tokens = text.lower().split()
        tokens = [word for word in tokens if word.isalpha()]
        tokens = [w.translate(table) for w in tokens]
        tokens = [word for word in tokens if not word in stop_words]
        tokens = [word for word in tokens if len(word) > 1]
        return tokens

    def review_to_words(text):
        clean_text = BeautifulSoup(text, "html5lib").get_text()
        clean_text = re.sub(r"[^a-zA-Z]", " ", clean_text)
        words = clean_text.lower().split()
        words = [w for w in words if w not in stopwords.words("english")]
        return words

    reviews = []
    for index, row in data_train.iterrows():
        text = row['Text'].lower()
        reviews.append(textclean(text))

    linked_reviews = list(itertools.chain.from_iterable(reviews))
    #print(len(linked_reviews))

    vocab_freq = dict()
    #print(linked_reviews[1])
    for word in linked_reviews:
        if word not in vocab_freq:
            vocab_freq[word] = 1
        else:
            vocab_freq[word] += 1

    sorted_vocab_freq = list(reversed(sorted(vocab_freq.items(), key=operator.itemgetter(1))))
    #print(sorted_vocab_freq)
    #print(len(sorted_vocab_freq))

    TOTAL_VOCAB = 5000
    word_to_id = dict()
    id_to_word = dict()
    for i in range(TOTAL_VOCAB):
        word_to_id[sorted_vocab_freq[i][0]] = i
        id_to_word[i] = sorted_vocab_freq[i][0]
    print(id_to_word[1])

    #review lengths
    review_lengths = pd.DataFrame([len(review) for review in reviews])
    review_lengths.columns = ['Len']
    #print(review_lengths)
    #stats
    #print(review_lengths.describe())

    def convert(l):
        new_l = []
        for word in l:
            if word in word_to_id:
                new_l.append(word_to_id[word])
        return new_l

    #print(len(data_train['Sentiment']))

    X_train = []
    y_train = []

    #Tukey's method
    first_q = review_lengths.Len.quantile([0.25])[0.25]
    third_q = review_lengths.Len.quantile([0.75])[0.75]
    upper_threshold = third_q + 1.5*(third_q - first_q)
    lower_threshold = first_q - 1.5*(third_q - first_q)
    #print(upper_threshold, lower_threshold)

    for i in range(len(data_train)):
        converted_review = convert(reviews[i])
        if len(converted_review) <= 250:
            X_train.append(converted_review)
            y_train.append(data_train['Sentiment'][i])

    X_train = np.array(X_train)
    y_train = np.array(y_train)
    #print(X_train)
    #print(y_train)

    X_train = sequence.pad_sequences(X_train, maxlen=int(upper_threshold), value=0)
    #print(X_train.shape, y_train.shape)

    data_test = pd.concat([pos_test_data, pos_test_data, neg_test_data], ignore_index=True)
    data_test = data_test.sample(frac=0.3).reset_index(drop=True)
    #print(data_test)
    #print(pos_test_data)

    validation_reviews = []
    for index, row in data_test.iterrows():
        text = row['Text'].lower()
        validation_reviews.append(textclean(text))

    X_val = []
    y_val = []
    for i in range(len(data_test)):
        converted_review = convert(validation_reviews[i])
        if len(converted_review) <= upper_threshold:
            X_val.append(converted_review)
            y_val.append(data_test['Sentiment'][i])

    X_val = np.array(X_val)
    X_val = sequence.pad_sequences(X_val, maxlen=int(upper_threshold), value=0)
    #print(X_val)
    y_val = np.array(y_val)

    #print(X_train)
    #print(X_train.shape)
    #print(y_train)

    EMBEDDING_LEN = 32

    model = Sequential()
    model.add(Embedding(TOTAL_VOCAB, EMBEDDING_LEN, input_length=205))
    model.add(Conv1D(256, 3, padding='same'))
    model.add(Dropout(0.25))
    model.add(Conv1D(128, 3, padding='same'))
    model.add(Dropout(0.25))
    model.add(Conv1D(32, 2, padding='same'))
    model.add(Dropout(0.25))
    model.add(MaxPooling1D(pool_length=4))
    model.add(Conv1D(16, 2, padding='same'))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dropout(0.25))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.summary()

    opt = optimizers.Adam(lr=0.2, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=1, batch_size=32)

And this is the function:

    def tfc(slova):
        # slova: list of tokenized documents (a list of lists of words)
        import math as m
        meshokslov1 = [{} for i in range(len(slova))]   # raw tf-idf weights per document
        meshokslov2 = [{} for i in range(len(slova))]   # normalized weights per document
        SummaKvadratov = 0                              # global sum of squared weights
        for i in range(len(slova)):
            for j in range(len(slova[i])):
                # document frequency: in how many documents does the word occur?
                n = 0
                for q in slova:
                    if q.count(slova[i][j]) != 0:
                        n += 1
                # skip words already weighted for this document
                if slova[i][j] in meshokslov1[i]:
                    continue
                else:
                    meshokslov1[i][slova[i][j]] = slova[i].count(slova[i][j]) * m.log10(len(slova) / n)
                    SummaKvadratov += (slova[i].count(slova[i][j]) * m.log10(len(slova) / n)) ** 2
        for i in range(len(slova)):
            for j in range(len(slova[i])):
                if slova[i][j] in meshokslov2[i]:
                    continue
                else:
                    meshokslov2[i][slova[i][j]] = meshokslov1[i][slova[i][j]] / (SummaKvadratov ** 0.5)
        return meshokslov2
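The hang is most likely caused by the innermost for q in slova loop: it rescans the entire corpus once per token, so the cost grows roughly as (total number of tokens) x (number of documents). Below is a sketch of the same weighting with the document frequencies precomputed in a single pass; tfc_fast is a hypothetical rewrite shown for illustration, and it counts each unique document/word pair once in the normalization:

    import math
    from collections import Counter

    def tfc_fast(slova):
        # slova: list of tokenized documents, same input as tfc() above
        N = len(slova)
        # one pass over the corpus: in how many documents does each word occur?
        df = Counter()
        for doc in slova:
            df.update(set(doc))
        raw = []            # per-document raw weights tf * log10(N / df)
        summa = 0.0         # global sum of squared weights
        for doc in slova:
            tf = Counter(doc)
            w = {word: cnt * math.log10(N / df[word]) for word, cnt in tf.items()}
            summa += sum(x * x for x in w.values())
            raw.append(w)
        norm = summa ** 0.5
        return [{word: x / norm for word, x in w.items()} for w in raw]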

2 answers

To calculate TF-IDF, you can use a ready-made solution: sklearn.feature_extraction.text.TfidfVectorizer.

Example:

    In [145]: from sklearn.feature_extraction.text import TfidfVectorizer

    In [146]: filename = r'D:\download\aclImdb\train\train_df.csv'

    In [147]: df = pd.read_csv(filename)

    In [148]: df.shape
    Out[148]: (25000, 2)

    In [149]: df.head()
    Out[149]:
                                                  review  sentiment
    0  Bromwell High is a cartoon comedy. It ran at t...          1
    1  Homelessness (or Houselessness as George Carli...          1
    2  Brilliant over-acting by Lesley Ann Warren. Be...          1
    3  This is easily the most underrated film inn th...          1
    4  This is not the typical Mel Brooks film. It wa...          1

    In [150]: vect = TfidfVectorizer()

    In [151]: X = vect.fit_transform(df['review'])

As a result, we get a sparse matrix of dimension 25000x74849 (25000 documents, 74849 features/words):

    In [152]: X
    Out[152]:
    <25000x74849 sparse matrix of type '<class 'numpy.float64'>'
        with 3445861 stored elements in Compressed Sparse Row format>
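If you also want to mirror the 5000-word vocabulary cap from the question and vectorize the test set with the same vocabulary, something along these lines should work (a sketch: max_features and the data_train / data_test column names are taken from the question's code):

    vect = TfidfVectorizer(max_features=5000)          # keep only the 5000 most frequent terms
    X_train = vect.fit_transform(data_train['Text'])   # learn the vocabulary on the training data
    X_test = vect.transform(data_test['Text'])         # reuse the same vocabulary for the test data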

    If TFC is calculated according to the following formula (page 5) :

    w(i, j) = [ tf(i, j) · log10( N / n(j) ) ] / sqrt( Σ_k Σ_l [ tf(k, l) · log10( N / n(l) ) ]² )

    where tf(i, j) is the frequency of term j in document i, N is the total number of documents, and n(j) is the number of documents containing term j; the sum in the denominator runs over all documents k and all terms l.


    then you can try to implement it in the following vectorized way:

        import numpy as np
        import scipy.sparse as sparse

        def tfc(freqs_arr):
            # make sure we are working with a CSR sparse matrix
            if sparse.isspmatrix_csr(freqs_arr):
                v = freqs_arr
            else:
                v = sparse.csr_matrix(freqs_arr)
            # IDF part: log10(N / n_j), where n_j is the number of documents containing term j
            l = sparse.csc_matrix(np.log10(v.shape[0] / (v > 0).sum(axis=0)).reshape(-1))
            # TF * IDF, element-wise
            v = v.multiply(l)
            # global normalization: square root of the sum of all squared weights
            denom = v.multiply(v).sum() ** 0.5
            return v / denom

    where freqs_arr is the matrix of word-occurrence frequencies for the normalized words (one row per document, one column per term).

    Example:

     In [128]: from sklearn.feature_extraction.text import CountVectorizer 

    By default, single-letter tokens (words) are ignored, so we specify the token_pattern explicitly:

     In [129]: cv = CountVectorizer(token_pattern='(?u)\\b\\w+\\b') 

    source documents:

     In [130]: data = ['Help, help, I need help!', 'I will help', 'Will you?'] 

    The CountVectorizer frequency matrix (CountVectorizer normalizes the text by default, e.g. lowercases it):

     In [131]: X = cv.fit_transform(data) 

    To see the result of the vectorization (word-occurrence frequencies) as a SparseDataFrame:

        In [132]: d = pd.SparseDataFrame(X, columns=cv.get_feature_names(), default_fill_value=0)

        In [133]: d
        Out[133]:
           help  i  need  will  you
        0     3  1     1     0    0
        1     1  1     0     1    0
        2     0  0     0     1    1

    TFC:

        In [302]: tfc(X)
        Out[302]:
        <3x5 sparse matrix of type '<class 'numpy.float64'>'
            with 8 stored elements in Compressed Sparse Row format>

    As a regular (dense) matrix, for display purposes:

        In [303]: tfc(X).A
        Out[303]:
        array([[0.56015692, 0.18671897, 0.50591716, 0.        , 0.        ],
               [0.18671897, 0.18671897, 0.        , 0.18671897, 0.        ],
               [0.        , 0.        , 0.        , 0.18671897, 0.50591716]])
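    As a quick hand check against the formula: for the word 'need' in document 0, tf = 1, N = 3 and n = 1, so the raw weight is 1 · log10(3/1) ≈ 0.4771; the global denominator (the square root of the sum of all squared weights in the matrix) is ≈ 0.9431, and 0.4771 / 0.9431 ≈ 0.5059, which matches the first row above.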

    The same, visualized as a SparseDataFrame:

        In [306]: pd.SparseDataFrame(tfc(X), columns=cv.get_feature_names(), default_fill_value=0)
        Out[306]:
               help         i      need      will       you
        0  0.560157  0.186719  0.505917  0.000000  0.000000
        1  0.186719  0.186719  0.000000  0.186719  0.000000
        2  0.000000  0.000000  0.000000  0.186719  0.505917

    P.S. I cannot vouch that the formula is implemented correctly; it needs to be verified.

    • Only now, how do I run the training sample through it, the one stored in reviews? - Midnight
    • Regarding my function, this is what it returns if you feed it your data doc = [['Help', 'help', 'I', 'need', 'help'], ['I', 'will', 'help'], ['Will', 'you']]. The output: [{'Help': 0.23905705128951346, 'help': 0.17645768965980294, 'I': 0.08822884482990147, 'need': 0.23905705128951346}, {'I': 0.08822884482990147, 'will': 0.23905705128951346, 'help': 0.08822884482990147}, {'Will': 0.23905705128951346, 'you': 0.23905705128951346}] - Midnight
    • And what about the TF-IDF you mentioned, how can it be implemented? - Midnight
    • How could your tfc calculation be applied when the documents are sub-lists of tokens rather than comma-separated strings, i.e. the data looks not like data = ['Help, help, I need help!', 'I will help', 'Will you?'] but like data = [['Help', 'help', 'I', 'need', 'help'], ['I', 'will', 'help'], ['Will', 'you']]? - Midnight
    • @Midnight, the easiest option is to convert it to that form, as in the answer: data = [' '.join(x) for x in data] (see the sketch below) - MaxU
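    A minimal sketch of that conversion on the toy data from the comments, reusing the cv and tfc defined in the answer above (note that CountVectorizer lowercases by default, so 'Help' and 'help' collapse into a single token, unlike the case-sensitive function from the question):

        data = [['Help', 'help', 'I', 'need', 'help'],
                ['I', 'will', 'help'],
                ['Will', 'you']]
        docs = [' '.join(x) for x in data]   # join each token sub-list back into one string
        X = cv.fit_transform(docs)           # word-frequency matrix, as above
        weights = tfc(X).A                   # dense matrix of TFC weights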