I was given this task as homework at an interview for an IT analyst vacancy. The solutions can be viewed in the answers below.

[image: first half of the columns]

[image: second half of the columns (the whole Excel spreadsheet is 5658 x 58)]

The task itself: [image: the task statement]

Link to the task file and data: http://yadi.sk/i/gDtn6k03w3cqWw

  • The problem statement is a bit odd. This looks like a regression task, since you need to predict two sums - so why do they call it classification? To confuse people? - MaxU
  • Could you upload the data to some file-sharing service? - MaxU
  • Yes, of course, here is the link: yadi.sk/i/gDtn6k03w3cqWw - Ste_kd
  • On the 2nd sheet it says "Use XGBoost". As far as I know, that is a library for the R language, so it is of no use to me: I can only use Python, I don't know much about machine learning yet, and I have just started reading the book "Introduction to Machine Learning with Python" by Andreas MΓΌller. I would be grateful for any pointers on how to solve this. - Ste_kd
  • I have not worked with XGBoost. I would rather use Keras / TensorFlow / PyTorch or, in the simplest case, sklearn. I will try to build a model when I have more free time - MaxU
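For what it is worth, XGBoost is not an R-only library: it also ships official Python bindings with a scikit-learn-compatible interface, so the "Use XGBoost" hint in the task does not rule out Python. Below is a minimal, untuned sketch of how it could be applied to this data, assuming the xgboost package is installed; the file path and hyperparameters are illustrative only, and 'Риск' is the target column of the task file:

    import pandas as pd
    import xgboost as xgb
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    # load the task file; the first column is used as the row index
    df = pd.read_excel('ZADANIE.xlsx', index_col=0)
    X, Y = df.iloc[:, :54], df['Риск']

    # XGBoost handles NaN's natively, so no explicit imputation step is needed here
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)

    # the hyperparameters below are placeholders, not tuned values
    clf = xgb.XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.1)
    clf.fit(X_train, Y_train)
    print('accuracy:', accuracy_score(Y_test, clf.predict(X_test)))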

3 answers

You could also use a deeper convolutional neural network (CNN). After 100 epochs of training, this model showed the best result - 97.17% correct predictions on previously unseen test data:

    In [34]: clf.evaluate(X_test, Y_test)
    566/566 [==============================] - 1s 2ms/step
    Out[34]: [0.08433653733465991, 0.9717314487632509]

    Normalized confusion matrix
    [[0.967 0.033]
     [0.014 0.986]]
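(The two numbers returned by clf.evaluate are, in order, the binary cross-entropy loss and the accuracy - the loss and metric the model was compiled with.)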

[image: normalized confusion matrix for the CNN model]

Example:

    def _conv1d(model, name_suffix=1, filters=32, kernel_size=3, padding='same',
                use_bias=True, max_pool_size=None, **kwargs):
        # one Conv1D block: convolution -> batch norm -> LeakyReLU [-> max pooling]
        model.add(Conv1D(filters=filters, kernel_size=kernel_size, padding=padding,
                         use_bias=use_bias, name=f'conv_{name_suffix}', **kwargs))
        model.add(BatchNormalization(name=f'norm_{name_suffix}'))
        model.add(LeakyReLU(alpha=0.1))
        if max_pool_size is not None:
            model.add(MaxPool1D(pool_size=max_pool_size))
        return model

    def train_CNN_clf(X, Y, validation_split=0.1, input_shape=(49, 1), model_fn=None,
                      lr=0.01, epochs=100, batch_size=32, patience=30, verbose=1):
        # building a prediction model
        model = Sequential(name='CNN')
        model = _conv1d(model, 1, 32, 5, 'same', max_pool_size=3, input_shape=input_shape)
        model = _conv1d(model, 2, 64, 5, 'same', max_pool_size=3)
        model = _conv1d(model, 3, 128, 5, 'same', max_pool_size=None)
        model = _conv1d(model, 4, 64, 5, 'same', max_pool_size=None)
        model = _conv1d(model, 5, 128, 5, 'same', max_pool_size=3)
        model = _conv1d(model, 6, 256, 5, 'same', max_pool_size=None)
        model = _conv1d(model, 7, 64, 5, 'same', max_pool_size=None)
        model.add(Flatten())
        model.add(Dense(64, activation='relu', name='dense_8'))
        model.add(Dropout(0.2))
        model.add(Dense(64, activation='relu', name='dense_9'))
        model.add(Dropout(0.2))
        model.add(Dense(1, activation='sigmoid', name='dense_out'))
        model.compile(optimizer=Nadam(lr=lr), loss='binary_crossentropy', metrics=['acc'])
        print(model.summary())

        # Keras callbacks: stop early if validation accuracy stalls,
        # and keep only the best model on disk
        early_stop = EarlyStopping(monitor='val_acc', min_delta=0.001, mode='auto',
                                   patience=min(patience, epochs), verbose=1)
        if model_fn is None:
            model_fn = 'model_cnn.h5'   # fall back to a file in the current directory
        chkpt = ModelCheckpoint(str(model_fn), monitor='val_acc', verbose=1,
                                save_best_only=True, mode='auto')
        # training the model
        clf_hist = model.fit(X, Y, validation_split=validation_split, epochs=epochs,
                             batch_size=batch_size, callbacks=[early_stop, chkpt])
        # return the best checkpoint, not the last epoch
        return load_model(model_fn)

    def main():
        work_dir = Path(r'D:\Work\ML\Classification\SO.885958-Keras_classification')
        data_fn = work_dir / 'ZADANIE.xlsx'
        model_fn = work_dir / 'model_cnn.h5'
        EPOCHS = 50
        BATCH_SIZE = 64

        # get and normalize data
        X, Y = get_data(data_fn)
        # Conv1D expects 3D input: (samples, steps, channels)
        X = np.expand_dims(X, axis=-1)
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)

        clf = train_CNN_clf(X_train, Y_train, validation_split=0.1, model_fn=model_fn,
                            lr=0.05, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)

        Y_pred = clf.predict_classes(X_test).ravel()

        # Compute confusion matrix
        class_names = ['OK', 'RISK']
        cnf_matrix = confusion_matrix(Y_test, Y_pred)
        np.set_printoptions(precision=3)

        # Plot normalized confusion matrix
        plt.figure()
        plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                              title='Normalized confusion matrix')
        plt.savefig(str(Path(data_fn).parent / 'confusion_matrix_CNN.png'))
        #plt.show()
        #loss, acc = clf.evaluate(X_test, Y_test)
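Note that a 1D convolutional network expects three-dimensional input of shape (samples, steps, channels); that is why main() adds a trailing channel axis with np.expand_dims(X, axis=-1), turning the 49 features of each row into a 49-step, single-channel sequence.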

P.S. The remaining functions (get_data, plot_confusion_matrix) are the same as in the next answer.

    I assumed that this is a classification task and that the last column, "Risk", is what needs to be predicted.
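    Before training it is worth checking how balanced the classes are, since plain accuracy can be misleading on a skewed target. A minimal sketch of such a check (the file path is illustrative; 'Риск' is the target column of the task file):

        import pandas as pd

        df = pd.read_excel('ZADANIE.xlsx', index_col=0)
        # share of each class in the target; a strong skew would make
        # plain accuracy a poor quality measure
        print(df['Риск'].value_counts(normalize=True))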

    Here's what I got:

        Normalized confusion matrix
        [[0.972 0.028]
         [0.047 0.953]]

    [image: normalized confusion matrix]

    Code:

        import os
        import itertools
        import numpy as np
        import pandas as pd
        try:
            from pathlib import Path
        except ImportError:
            from pathlib2 import Path
        from sklearn.model_selection import train_test_split
        from sklearn.preprocessing import StandardScaler, MinMaxScaler
        from sklearn.metrics import confusion_matrix
        try:
            from sklearn.impute import SimpleImputer as Imputer
        except ImportError:
            from sklearn.preprocessing.imputation import Imputer
        from keras import Sequential
        from keras.layers import *
        from keras.optimizers import Adam, Nadam
        from keras.regularizers import l1, l2
        from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
        from keras.models import load_model
        import matplotlib.pyplot as plt

        ######################

        def get_data(filename, target_col_name='Риск', impute=True, normalize=True):
            df = pd.read_excel(filename, index_col=0)
            X, Y = df.iloc[:, :54], df[target_col_name]
            # get rid of columns with a single unique value (all rows have the same value)
            X = X.loc[:, X.nunique() > 1]
            # get lists of binary and numeric columns
            bin_cols = X.columns[X.nunique() == 2]
            num_cols = X.columns.difference(bin_cols)
            if impute:
                # imputing missing data (NaN's)
                imp = Imputer()
                X = pd.DataFrame(imp.fit_transform(X), columns=X.columns, index=X.index)
            if normalize:
                # normalizing data
                scaler = StandardScaler()
                X[num_cols] = pd.DataFrame(scaler.fit_transform(X[num_cols]),
                                           columns=num_cols, index=X.index)
            return X, Y

        def train_classifier(X, Y, validation_split=0.1, model_fn=None, lr=0.01,
                             epochs=100, batch_size=32, patience=30, verbose=1):
            # building a prediction model
            model = Sequential()
            model.add(Dense(64, activation='relu', input_shape=(49,), name='dense_1'))
            model.add(BatchNormalization(name='norm_1'))
            model.add(Dense(64, activation='relu', name='dense_2'))
            model.add(BatchNormalization(name='norm_2'))
            model.add(Dropout(0.1))
            model.add(Dense(128, activation='relu', name='dense_3'))
            model.add(BatchNormalization(name='norm_3'))
            model.add(Dropout(0.2))
            model.add(Dense(256, activation='relu', name='dense_4'))
            model.add(BatchNormalization(name='norm_4'))
            model.add(Dropout(0.25))
            model.add(Dense(1, activation='sigmoid', name='dense_out'))
            model.compile(optimizer=Nadam(lr=lr), loss='binary_crossentropy', metrics=['acc'])

            # Keras callbacks: early stopping plus checkpointing of the best model
            early_stop = EarlyStopping(monitor='val_acc', min_delta=0.001, mode='auto',
                                       patience=min(patience, epochs), verbose=1)
            if model_fn is None:
                model_fn = 'risk_clf.h5'   # fall back to a file in the current directory
            chkpt = ModelCheckpoint(str(model_fn), monitor='val_acc', verbose=1,
                                    save_best_only=True, mode='auto')
            # training the model
            clf_hist = model.fit(X, Y, validation_split=validation_split, epochs=epochs,
                                 batch_size=batch_size, callbacks=[early_stop, chkpt])
            # return the best checkpoint, not the last epoch
            return load_model(model_fn)

        def plot_confusion_matrix(cm, classes, normalize=False,
                                  title='Confusion matrix', cmap=plt.cm.Blues):
            """
            This function prints and plots the confusion matrix.
            Normalization can be applied by setting `normalize=True`.
            (c) http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
            """
            if normalize:
                cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
                print("Normalized confusion matrix")
            else:
                print('Confusion matrix, without normalization')
            print(cm)

            plt.imshow(cm, interpolation='nearest', cmap=cmap)
            plt.title(title)
            plt.colorbar()
            tick_marks = np.arange(len(classes))
            plt.xticks(tick_marks, classes, rotation=45)
            plt.yticks(tick_marks, classes)

            fmt = '.3f' if normalize else 'd'
            thresh = cm.max() / 2.
            for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
                plt.text(j, i, format(cm[i, j], fmt),
                         horizontalalignment="center",
                         color="white" if cm[i, j] > thresh else "black")

            plt.ylabel('True label')
            plt.xlabel('Predicted label')
            plt.tight_layout()

        #############################################################################

        def main():
            work_dir = Path(r'D:\Work\ML\Classification\SO.885958-Keras_classification')
            data_fn = work_dir / 'ZADANIE.xlsx'
            clf_model_fn = work_dir / 'model.h5'
            EPOCHS = 200
            BATCH_SIZE = 32

            # get and normalize data
            X, Y = get_data(data_fn)
            X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)

            clf = train_classifier(X_train, Y_train, validation_split=0.1,
                                   model_fn=clf_model_fn, lr=0.05, epochs=EPOCHS,
                                   batch_size=BATCH_SIZE, verbose=1)

            Y_pred = clf.predict_classes(X_test).ravel()

            # Compute confusion matrix
            class_names = ['OK', 'RISK']
            cnf_matrix = confusion_matrix(Y_test, Y_pred)
            np.set_printoptions(precision=3)

            # Plot normalized confusion matrix
            plt.figure()
            plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                                  title='Normalized confusion matrix')
            plt.savefig(str(Path(data_fn).parent / 'confusion_matrix.png'))
            #plt.show()
            #loss, acc = clf.evaluate(X_test, Y_test)

        #############################################################################

        if __name__ == "__main__":
            main()
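    Since the network ends in a single sigmoid unit trained with binary cross-entropy, predict_classes simply thresholds the predicted probability at 0.5; if false negatives were more costly than false positives, the cut-off could be moved by hand, e.g. (clf.predict(X_test) > threshold).astype(int) with a hypothetical threshold below 0.5.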
    • Thanks! Tomorrow I will definitely take a look and try to reproduce your result - Ste_kd
    • To be honest, so far I have only managed to work through your get_data function. It does vaguely resemble what I did myself: reading the data, replacing NaNs, normalizing, and splitting into target and features. The rest is a dark forest to me - Ste_kd
    • @Ste_kd, if you have never dealt with machine learning tasks and neural networks before, then, frankly, it will be difficult to master all of this in a few days... - MaxU
    • I have added my solution (not without your help :)) - could you rate it? - Ste_kd
    • @Ste_kd, in my opinion you are doing very well. You should post your solution as an answer... - MaxU

    In my solution, features 1, 3, 4, 5 and 6 were discarded when building the model. Some of the imports are not used in this code, because there is one more function for computing parameters that I have not posted.

        import pandas as pd
        from sklearn.preprocessing import StandardScaler
        from sklearn.impute import SimpleImputer as Imputer
        from sklearn import ensemble
        import numpy as np
        import matplotlib.pyplot as plt
        from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
        from scipy.interpolate import interp1d
        from scipy.integrate import quad
        from sklearn.metrics import roc_auc_score, roc_curve
        import warnings

        warnings.filterwarnings('ignore')
        plt.style.use('ggplot')

        def plotting_Gini(targetcolumn, predictcolumn):
            actual = list(targetcolumn.values)
            predict = list(predictcolumn.values)
            # sort actual labels by descending predicted score
            data = zip(actual, predict)
            sorted_data = sorted(data, key=lambda d: d[1], reverse=True)
            sorted_actual = [d[0] for d in sorted_data]
            cumulative_actual = np.cumsum(sorted_actual) / sum(actual)
            cumulative_index = np.arange(1, len(cumulative_actual) + 1) / len(predict)
            cumulative_actual_perfect = np.cumsum(sorted(actual, reverse=True)) / sum(actual)
            aucroc = roc_auc_score(actual, predict)
            gini = 2 * roc_auc_score(actual, predict) - 1
            fpr, tpr, t = roc_curve(actual, predict)
            x_values = [0] + list(cumulative_index)
            y_values = [0] + list(cumulative_actual)
            y_values_perfect = [0] + list(cumulative_actual_perfect)
            fig, ax = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(18, 6))
            fig.suptitle(f'Gini = {gini}\n\n', fontsize=26, fontweight='bold')
            # areas under the model and "perfect" cumulative curves
            f1, f2 = interp1d(x_values, y_values), interp1d(x_values, y_values_perfect)
            S_pred = quad(f1, 0, 1, points=x_values, limit=len(x_values))[0] - 0.5
            S_actual = quad(f2, 0, 1, points=x_values, limit=len(x_values))[0] - 0.5
            ax[0].plot([0] + fpr.tolist(), [0] + tpr.tolist(), lw=2, color='red')
            ax[0].fill_between([0] + fpr.tolist(), [0] + tpr.tolist(), color='red', alpha=0.1)
            ax[0].text(0.4, 0.2, 'S = {:0.3f}'.format(aucroc), fontsize=28)
            ax[1].plot(x_values, y_values, lw=2, color='blue')
            ax[1].fill_between(x_values, x_values, y_values, color='blue', alpha=0.1)
            ax[1].text(0.4, 0.2, 'S = {:0.3f}'.format(S_pred), fontsize=28)
            ax[2].plot(x_values, y_values_perfect, lw=2, color='green')
            ax[2].fill_between(x_values, x_values, y_values_perfect, color='green', alpha=0.1)
            ax[2].text(0.4, 0.2, 'S = {:0.3f}'.format(S_actual), fontsize=28)
            ax[0].set(title='ROC-AUC Baseline', xlabel='False Positive Rate',
                      ylabel='True Positive Rate', xlim=(0, 1), ylim=(0, 1))
            ax[1].set(title='Gini Baseline')
            ax[2].set(title='Gini Perfect')
            for i in range(1, 3):
                ax[i].plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
                ax[i].set(xlabel='Share of clients', ylabel='True Positive Rate',
                          xlim=(0, 1), ylim=(0, 1))
            plt.show()

        def plotting_feature_priority(X, model, n=3):
            importances = model.feature_importances_
            indices = np.argsort(importances)[::-1]
            feature_names = X.columns
            d_first = X.shape[1]
            plt.figure(figsize=(8, 8))
            plt.title("Feature importance")
            plt.bar(range(d_first), importances[indices[:d_first]], align='center')
            plt.xticks(range(d_first), np.array(feature_names)[indices[:d_first]], rotation=90)
            plt.xlim([-1, d_first])
            best_features = indices[:n]
            best_features_names = feature_names[best_features]
            print(f'Top {n} most important features out of {d_first}: {list(best_features_names)}')
            plt.show()

        def normalize_delete_Nans(features, target, impute=True, normalize=True):
            """Removes NaN's and normalizes the values"""
            X, y = features, target
            # drop columns in which all values are identical
            # alternative (.nunique() returns the number of unique values in a column):
            # notsamevls = [clmn for clmn in X.columns if X[clmn].nunique() > 1]
            # X = X[notsamevls]
            X = X.loc[:, X.nunique() > 1]
            # get the lists of binary and numeric columns
            bin_cols = X.columns[X.nunique() == 2]
            num_cols = X.columns.difference(bin_cols)
            if impute:
                imp = Imputer()
                # getting rid of NaN's; alternatives:
                # X = X.fillna(X.median(axis=0), axis=0)  # replace with medians
                # X.fillna(-999, inplace=True)            # replace with the number -999
                X = pd.DataFrame(imp.fit_transform(X), columns=X.columns, index=X.index)
            if normalize:
                scaler = StandardScaler()
                # "smoothing", data normalization; alternative:
                # (each numeric feature is scaled to zero mean and unit standard deviation)
                # X[num_cols] = (X[num_cols] - X[num_cols].mean()) / X[num_cols].std()
                X[num_cols] = pd.DataFrame(scaler.fit_transform(X[num_cols]),
                                           columns=num_cols, index=X.index)
            return X, y

        def create_and_learn_rf_classifier(X, y, n=1, inf=True):
            '''Creates and trains an RF classifier; returns the model and the
            predicted result for all features.
            n - parameter for assigning the score; 0 - in reverse order'''
            # split the sample into training and test sets
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                                random_state=42)
            # create and train the model
            rf = ensemble.RandomForestClassifier(
                bootstrap=True, class_weight=None, criterion='gini', max_depth=30,
                max_features=5, max_leaf_nodes=None, min_impurity_decrease=0.0,
                min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,
                min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=-1,
                oob_score=True, random_state=42, verbose=0, warm_start=False)
            rf.fit(X_train, y_train)
            prediction = rf.predict_proba(X)[:, n]
            if inf:
                err_test = np.mean(y_test != rf.predict(X_test))
                print(f'Mean share of correct answers: {100 - err_test * 100}%')
                print(f'Minimum predicted value: {min(prediction)}, maximum: {max(prediction)}')
            return rf, prediction

        if __name__ == "__main__":
            drctry = 'C:\\Users\\Stepan\\Downloads\\ZADANIE.xlsx'
            df = pd.read_excel(drctry)  # ,index_col=0 would make the 1st column the index of df
            target = df['Риск']              # target
            features = df[df.columns[2:55]]  # features
            X, y = normalize_delete_Nans(features, target)
            #print(X.head())
            model, result = create_and_learn_rf_classifier(X, y, 0)
            # feature-importance chart and Gini index plots
            plotting_feature_priority(X, model, 10)
            plotting_Gini(y, pd.Series(create_and_learn_rf_classifier(X, y, 1, inf=False)[1],
                                       index=X.index))
            # create a column with the scoring value and write the result to a file
            df["Score"] = pd.DataFrame(np.array(result), index=X.index)
            #print(df["Score"].value_counts())
            df.to_excel('C:\\Users\\Stepan\\Downloads\\res.xlsx')
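    The Gini coefficient in the plot title is computed from ROC-AUC via the identity Gini = 2 * AUC - 1, so a random ranking gives Gini close to 0 and a perfect ranking gives Gini = 1.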

    [image: feature importance chart]

    [image: ROC-AUC and Gini curves]