for doc, category in zip(docs_new, predicted): print('%r => %s' % (doc, twenty_train.target_names[category])) How can I find out the probability with which the algorithm decided that a text belongs to the predicted group?
Here is the full code:
# Train a multinomial Naive Bayes text classifier on the documents stored
# under the 'db' directory, then print the predicted category name for each
# new document.
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Only these category names are requested from load_files.
categories = ['first', 'second', 'third']
twenty_train = load_files(
    'db',
    categories=categories,
    shuffle=False,
    encoding='utf-8',
)

# Bag-of-words counts for the training documents.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

# Re-weight the raw counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

# Fit the classifier on the TF-IDF features and the training targets.
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target)

# NOTE(review): str1 and str2 are not defined in this snippet; they are
# presumably the raw text strings to classify -- confirm against the caller.
docs_new = [str1, str2]

# New documents go through the already-fitted vectorizer and transformer
# (transform only, no re-fitting).
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
# NOTE: per the scikit-learn documentation, clf.predict_proba(X_new_tfidf)
# returns the per-class probability estimates for the same rows.

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))
What is docs_new? What does twenty_train look like? - MaxU