I'm trying to follow articles from Habr to learn text processing with scikit-learn. With the test sample everything works fine, but when I load my own database every text gets classified as 'first'. What am I doing wrong? A follow-up question: is it possible to show the probability that a text belongs to a given class?
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

categories = ['first', 'second', 'third']
a = load_files('db', encoding='utf-8', categories=categories)

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(a.data)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB().fit(X_train_tfidf, a.target)

docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, a.target_names[category]))
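
On the second question, here is a minimal sketch of what I have in mind (not part of my original code), assuming the variables clf, a, docs_new and X_new_tfidf defined above: MultinomialNB has a predict_proba method that returns one probability per class for each document, with columns ordered according to clf.classes_.

# Sketch: per-class probabilities with MultinomialNB.predict_proba
# (assumes clf, a, docs_new, X_new_tfidf from the code above)
probas = clf.predict_proba(X_new_tfidf)  # shape: (n_docs, n_classes)
for doc, row in zip(docs_new, probas):
    # columns of `row` follow the order of clf.classes_
    for cls, p in zip(clf.classes_, row):
        print('%r => %s: %.3f' % (doc, a.target_names[cls], p))

Is something like this the right way to get the class probabilities?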