There is a database with comments. I need to find the most frequent words across all the comments. So far all I have managed to do is count the occurrences of one specific word that is hard-coded in the script. Is it possible to implement such a search so that it works universally for any set of comments / texts / anything else in Russian?

    import pandas as pd
    import sys
    import pymysql
    import numpy as np
    import nltk
    from nltk.corpus import state_union
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords

    # Connect to the database and load the comments into a DataFrame
    db = pymysql.connect(host='localhost', user='root', passwd='', database='mom_db', charset='utf8')
    df = pd.read_sql("SELECT comm2 FROM comments", db)

    # The function strips whitespace at the text boundaries and removes periods, commas, etc.
    def delete_chars(text):
        text = text.lstrip()
        text = text.rstrip()
        text = text.replace(".", " ")
        text = text.replace(",", " ")
        text = text.replace("-", " ")
        text = text.replace("?", " ")
        text = text.replace("!", " ")
        text = text.replace(")", " ")
        text = text.replace("(", " ")
        text = text.replace("...", " ")
        text = text.replace("—", " ")
        text = text.replace(":", " ")
        text = text.replace("<", " ")
        text = text.replace(">", " ")
        text = text.replace("/", " ")
        text = text.replace("``", " ")
        text = text.replace("'", " ")
        text = text.replace("«", " ")
        text = text.replace("»", " ")
        text = text.replace(";", " ")
        text = text.lower()
        return text

    df['comm2'] = df['comm2'].apply(delete_chars)

    st_w = set(stopwords.words('russian'))
    words_filtered = []

    # Tokenize every comment and keep only the tokens that are not stopwords
    for comment in df['comm2']:
        for w in word_tokenize(comment):
            if w not in st_w:
                words_filtered.append(w)

    print(words_filtered)

    # Count the occurrences of one hard-coded word
    count = words_filtered.count('Европе')
    print(count)
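
(For reference, the single hard-coded word count at the end could in principle be generalized by counting every token at once; a minimal, untested sketch of that idea, reusing the words_filtered list built above:)

    from collections import Counter

    # Count all filtered tokens and show the ten most frequent ones
    word_counts = Counter(words_filtered)
    for word, cnt in word_counts.most_common(10):
        print(word, cnt)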

P.S. Is there perhaps a more optimal solution than the delete_chars function?
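
(One possible direction, as a rough untested sketch: the whole chain of replace() calls could be collapsed into a single regular expression that turns every run of non-word characters into a space; the character class may need tuning for real comments.)

    import re

    def delete_chars(text):
        # Replace every run of non-word characters (punctuation, quotes, dashes, ...) with a space
        return re.sub(r'\W+', ' ', text).strip().lower()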

An example of the required output:

 "слово_1" 33 "слово_2" 22 "слово_3" 11 
  • Take a look at the plot_word_cloud() function from this answer; I did practically the same thing there that you want to do ... - MaxU

2 answers

It can be done like this:

    import os
    import re
    import pandas as pd
    import nltk
    from nltk.corpus import stopwords
    from nltk import word_tokenize, sent_tokenize, FreqDist
    from collections import defaultdict, Counter
    import string
    import requests

    def get_text(url, encoding='utf-8', to_lower=True):
        url = str(url)
        if url.startswith('http'):
            r = requests.get(url)
            if not r.ok:
                r.raise_for_status()
            return r.text.casefold() if to_lower else r.text
        elif os.path.exists(url):
            with open(url, encoding=encoding) as f:
                return f.read().casefold() if to_lower else f.read()
        else:
            raise Exception('parameter [url] must be either URL or a filename')

    def mk_trans_tab(chars2remove):
        return str.maketrans(dict(zip(chars2remove, list(' ' * len(chars2remove)))))

    # Гоголь Н. В., «Ревизор» (Gogol, "The Government Inspector"): http://lifeinbooks.net/chto-pochitat/nikolaj-gogol-revizor-vot-tak-i-po-sej-den/
    url = r'C:\download\Gogol_N._Revizor.txt'

    stop_words = set(stopwords.words('russian'))
    transl_tab = mk_trans_tab(list(string.punctuation) + list('\r\n«»–'))

    text = get_text(url, encoding='cp1251')

    df = pd.DataFrame({'comm': re.split(r'[\n\r\.\?!]', text)})
    df['comm'] = df['comm'].str.translate(transl_tab).str.lower()

    words = [w for w in word_tokenize(df['comm'].str.cat(sep=' ')) if w not in stop_words]

    fdist = FreqDist(words)
    print(fdist.most_common(20))

Output:

    [('хлестаков', 244), ('городничий', 193), ('это', 155), ('анна', 127), ('андреевна', 125), ('осип', 78), ('марья', 65), ('антоновна', 64), ('добчинский', 63), ('аммос', 60), ('говорит', 60), ('федорович', 59), ('артемий', 58), ('филиппович', 58), ('бобчинский', 58), ('почтмейстер', 55), ('очень', 51), ('явление', 51), ('тебе', 50), ('иванович', 42)]
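
If the output needs to match the `"слово" count` format asked for in the question, the same fdist can be printed pair by pair (a small follow-up sketch on top of the code above):

    # Print the top words in the format requested in the question
    for word, cnt in fdist.most_common(3):
        print('"{}" {}'.format(word, cnt))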

Maybe my clumsy implementation will also be useful to someone)) I have additionally bolted on reducing the words to their normal (initial) form here:

    import pandas as pd
    import sys
    import pymysql
    import numpy as np
    import nltk
    from nltk.corpus import state_union
    from nltk import word_tokenize, FreqDist
    from nltk.corpus import stopwords
    import string
    import pymorphy2

    # Connect to the database and load the comments into a DataFrame
    db = pymysql.connect(host='localhost', user='root', passwd='', database='mom_db', charset='utf8')
    df = pd.read_sql("SELECT comm2 FROM comments", db)

    # Builds a translation table that removes unwanted characters: periods, commas and other punctuation
    def mk_trans_tab(chars2remove):
        return str.maketrans(dict(zip(chars2remove, list(' ' * len(chars2remove)))))

    transl_tab = mk_trans_tab(list(string.punctuation) + list('\r\n«»–'))
    df['comm2'] = df['comm2'].str.translate(transl_tab).str.lower()

    st_w = set(stopwords.words('russian'))
    morph = pymorphy2.MorphAnalyzer()  # the analyzer is needed for morph.parse() below
    words_filtered = []

    # Tokenize every comment, drop the stopwords and reduce each remaining word to its normal form
    for comment in df['comm2']:
        for w in word_tokenize(comment):
            if w not in st_w:
                parsed_word = morph.parse(w)[0]
                words_filtered.append(parsed_word.normal_form)

    count = FreqDist(words_filtered)
    print(count.most_common(20))

The output I got was this (and I was personally completely satisfied with it):

    [('газ', 111), ('россия', 110), ('европа', 97), ('газа', 87), ('украина', 70), ('газпром', 66), ('цена', 60), ('страна', 49), ('российский', 45), ('бюджет', 41), ('покупать', 37), ('сша', 33), ('ввп', 33), ('ес', 31), ('доход', 31), ('какой', 30), ('рф', 30), ('деньга', 28), ('2', 28), ('нефть', 25)]
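
A note on the lemmatization step in the snippet above: it relies on pymorphy2's normal_form, and the MorphAnalyzer object needs to be created once before the loop. A minimal standalone check (the exact result depends on the installed pymorphy2 dictionaries):

    import pymorphy2

    morph = pymorphy2.MorphAnalyzer()
    # The inflected form 'газа' should be reduced to the initial form 'газ'
    print(morph.parse('газа')[0].normal_form)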