There is a database with comments. I need to find the most frequent words across all the comments. So far all I have managed to do is count the occurrences of one specific word that is hard-coded in the script. Is it possible to implement such a search so that it works universally for any set of comments / texts / anything else in Russian?

    import pandas as pd
    import sys
    import pymysql
    import numpy as np
    import nltk
    from nltk.corpus import state_union
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords

    # Connect to the database and load the comments into a DataFrame
    db = pymysql.connect(host='localhost', user='root', passwd='', database='mom_db', charset='utf8')
    df = pd.read_sql("SELECT comm2 FROM comments", db)

    # The function strips whitespace at the text boundaries and removes periods, commas, etc.
    def delete_chars(text):
        text = text.lstrip()
        text = text.rstrip()
        text = text.replace(".", " ")
        text = text.replace(",", " ")
        text = text.replace("-", " ")
        text = text.replace("?", " ")
        text = text.replace("!", " ")
        text = text.replace(")", " ")
        text = text.replace("(", " ")
        text = text.replace("...", " ")
        text = text.replace("—", " ")
        text = text.replace(":", " ")
        text = text.replace("<", " ")
        text = text.replace(">", " ")
        text = text.replace("/", " ")
        text = text.replace("``", " ")
        text = text.replace("'", " ")
        text = text.replace("«", " ")
        text = text.replace("»", " ")
        text = text.replace(";", " ")
        text = text.lower()
        return text

    df['comm2'] = df['comm2'].apply(delete_chars)

    st_w = set(stopwords.words('russian'))
    words_filtered = []

    # Tokenize every comment and keep only the tokens that are not stopwords
    for comment in df['comm2']:
        for w in word_tokenize(comment):
            if w not in st_w:
                words_filtered.append(w)

    print(words_filtered)

    # Count the occurrences of one hard-coded word
    count = words_filtered.count('Европе')
    print(count)
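
(For reference, the single hard-coded word count at the end could in principle be generalized by counting every token at once; a minimal, untested sketch of that idea, reusing the words_filtered list built above:)

    from collections import Counter

    # Count all filtered tokens and show the ten most frequent ones
    word_counts = Counter(words_filtered)
    for word, cnt in word_counts.most_common(10):
        print(word, cnt)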

P.S. Is there perhaps a more optimal solution than the delete_chars function?
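
(One possible direction, as a rough untested sketch: the whole chain of replace() calls could be collapsed into a single regular expression that turns every run of non-word characters into a space; the character class may need tuning for real comments.)

    import re

    def delete_chars(text):
        # Replace every run of non-word characters (punctuation, quotes, dashes, ...) with a space
        return re.sub(r'\W+', ' ', text).strip().lower()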

An example of the required output:

 "слово_1" 33 "слово_2" 22 "слово_3" 11 
  • Take a look at the plot_word_cloud() function from this answer; I did practically the same thing there that you want to do ... - MaxU

2 answers

It can be done like this:

    import os
    import re
    import pandas as pd
    import nltk
    from nltk.corpus import stopwords
    from nltk import word_tokenize, sent_tokenize, FreqDist
    from collections import defaultdict, Counter
    import string
    import requests

    def get_text(url, encoding='utf-8', to_lower=True):
        url = str(url)
        if url.startswith('http'):
            r = requests.get(url)
            if not r.ok:
                r.raise_for_status()
            return r.text.casefold() if to_lower else r.text
        elif os.path.exists(url):
            with open(url, encoding=encoding) as f:
                return f.read().casefold() if to_lower else f.read()
        else:
            raise Exception('parameter [url] must be either URL or a filename')

    def mk_trans_tab(chars2remove):
        return str.maketrans(dict(zip(chars2remove, list(' ' * len(chars2remove)))))

    # Гоголь Н. В., «Ревизор» (Gogol, "The Government Inspector"): http://lifeinbooks.net/chto-pochitat/nikolaj-gogol-revizor-vot-tak-i-po-sej-den/
    url = r'C:\download\Gogol_N._Revizor.txt'

    stop_words = set(stopwords.words('russian'))
    transl_tab = mk_trans_tab(list(string.punctuation) + list('\r\n«»–'))

    text = get_text(url, encoding='cp1251')

    df = pd.DataFrame({'comm': re.split(r'[\n\r\.\?!]', text)})
    df['comm'] = df['comm'].str.translate(transl_tab).str.lower()

    words = [w for w in word_tokenize(df['comm'].str.cat(sep=' ')) if w not in stop_words]

    fdist = FreqDist(words)
    print(fdist.most_common(20))

Output:

    [('хлестаков', 244), ('городничий', 193), ('это', 155), ('анна', 127), ('андреевна', 125), ('осип', 78), ('марья', 65), ('антоновна', 64), ('добчинский', 63), ('аммос', 60), ('говорит', 60), ('федорович', 59), ('артемий', 58), ('филиппович', 58), ('бобчинский', 58), ('почтмейстер', 55), ('очень', 51), ('явление', 51), ('тебе', 50), ('иванович', 42)]
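
If the output needs to match the `"слово" count` format asked for in the question, the same fdist can be printed pair by pair (a small follow-up sketch on top of the code above):

    # Print the top words in the format requested in the question
    for word, cnt in fdist.most_common(3):
        print('"{}" {}'.format(word, cnt))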

Maybe my clumsy implementation will also be useful to someone)) I have additionally bolted on reducing the words to their normal (initial) form here:

    import pandas as pd
    import sys
    import pymysql
    import numpy as np
    import nltk
    from nltk.corpus import state_union
    from nltk import word_tokenize, FreqDist
    from nltk.corpus import stopwords
    import string
    import pymorphy2

    # Connect to the database and load the comments into a DataFrame
    db = pymysql.connect(host='localhost', user='root', passwd='', database='mom_db', charset='utf8')
    df = pd.read_sql("SELECT comm2 FROM comments", db)

    # Builds a translation table that removes unwanted characters: periods, commas and other punctuation
    def mk_trans_tab(chars2remove):
        return str.maketrans(dict(zip(chars2remove, list(' ' * len(chars2remove)))))

    transl_tab = mk_trans_tab(list(string.punctuation) + list('\r\n«»–'))
    df['comm2'] = df['comm2'].str.translate(transl_tab).str.lower()

    st_w = set(stopwords.words('russian'))
    morph = pymorphy2.MorphAnalyzer()  # the analyzer is needed for morph.parse() below
    words_filtered = []

    # Tokenize every comment, drop the stopwords and reduce each remaining word to its normal form
    for comment in df['comm2']:
        for w in word_tokenize(comment):
            if w not in st_w:
                parsed_word = morph.parse(w)[0]
                words_filtered.append(parsed_word.normal_form)

    count = FreqDist(words_filtered)
    print(count.most_common(20))

The output I got was this (and I was personally completely satisfied with it):

    [('газ', 111), ('россия', 110), ('европа', 97), ('газа', 87), ('украина', 70), ('газпром', 66), ('цена', 60), ('страна', 49), ('российский', 45), ('бюджет', 41), ('покупать', 37), ('сша', 33), ('ввп', 33), ('ес', 31), ('доход', 31), ('какой', 30), ('рф', 30), ('деньга', 28), ('2', 28), ('нефть', 25)]
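
A note on the lemmatization step in the snippet above: it relies on pymorphy2's normal_form, and the MorphAnalyzer object needs to be created once before the loop. A minimal standalone check (the exact result depends on the installed pymorphy2 dictionaries):

    import pymorphy2

    morph = pymorphy2.MorphAnalyzer()
    # The inflected form 'газа' should be reduced to the initial form 'газ'
    print(morph.parse('газа')[0].normal_form)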