This task was given at an interview as a take-home assignment for an IT analyst vacancy. The solutions can be viewed in the answers below.
Second half of the table (the whole Excel spreadsheet is 5658 x 58)
Link to the task file and data: http://yadi.sk/i/gDtn6k03w3cqWw
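For orientation before the solutions, here is a minimal sketch of loading and inspecting the data. It assumes the file from the link above is saved locally as ZADANIE.xlsx and that the target column is named 'Риск' ('Risk'), as in the answers below:

import pandas as pd

# load the spreadsheet
df = pd.read_excel('ZADANIE.xlsx')
print(df.shape)                   # roughly (5658, 58) per the description above
print(df['Риск'].value_counts())  # class balance of the target column
print(df.isna().sum().sort_values(ascending=False).head())  # columns with the most NaNs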
You can also use a deeper convolutional neural network (CNN). After 100 epochs of training, this model showed the best result: 97.17% correct predictions on a held-out data sample:
In [34]: clf.evaluate(X_test, Y_test)
566/566 [==============================] - 1s 2ms/step
Out[34]: [0.08433653733465991, 0.9717314487632509]

Normalized confusion matrix
[[0.967 0.033]
 [0.014 0.986]]

Example:
def _conv1d(model, name_suffix=1, filters=32, kernel_size=3, padding='same',
            use_bias=True, max_pool_size=None, **kwargs):
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, padding=padding,
                     use_bias=use_bias, name=f'conv_{name_suffix}', **kwargs))
    model.add(BatchNormalization(name=f'norm_{name_suffix}'))
    model.add(LeakyReLU(alpha=0.1))
    if max_pool_size is not None:
        model.add(MaxPool1D(pool_size=max_pool_size))
    return model


def train_CNN_clf(X, Y, validation_split=0.1, input_shape=(49, 1), model_fn=None,
                  lr=0.01, epochs=100, batch_size=32, patience=30, verbose=1):
    # building a prediction model
    model = Sequential(name='CNN')
    model = _conv1d(model, 1, 32, 5, 'same', max_pool_size=3, input_shape=input_shape)
    model = _conv1d(model, 2, 64, 5, 'same', max_pool_size=3)
    model = _conv1d(model, 3, 128, 5, 'same', max_pool_size=None)
    model = _conv1d(model, 4, 64, 5, 'same', max_pool_size=None)
    model = _conv1d(model, 5, 128, 5, 'same', max_pool_size=3)
    model = _conv1d(model, 6, 256, 5, 'same', max_pool_size=None)
    model = _conv1d(model, 7, 64, 5, 'same', max_pool_size=None)
    model.add(Flatten())
    model.add(Dense(64, activation='relu', name='dense_8'))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu', name='dense_9'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid', name='dense_out'))
    model.compile(optimizer=Nadam(lr=lr), loss='binary_crossentropy', metrics=['acc'])
    print(model.summary())

    # Keras callbacks...
    early_stop = EarlyStopping(monitor='val_acc', min_delta=0.001, mode='auto',
                               patience=min(patience, epochs), verbose=1)
    if model_fn is None:
        model_fn = 'model_cnn.h5'  # fixed: the original referenced `data_fn`, which is out of scope here
    chkpt = ModelCheckpoint(str(model_fn), monitor='val_acc', verbose=1,
                            save_best_only=True, mode='auto')
    # training the model
    clf_hist = model.fit(X, Y, validation_split=validation_split, epochs=epochs,
                         batch_size=batch_size, callbacks=[early_stop, chkpt])
    return load_model(str(model_fn))


def main():
    work_dir = Path(r'D:\Work\ML\Classification\SO.885958-Keras_classification')
    data_fn = work_dir / 'ZADANIE.xlsx'
    model_fn = work_dir / 'model_cnn.h5'
    EPOCHS = 50
    BATCH_SIZE = 64

    # get and normalize data
    X, Y = get_data(data_fn)
    X = np.expand_dims(X, axis=-1)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)

    clf = train_CNN_clf(X_train, Y_train, validation_split=0.1, model_fn=model_fn,
                        lr=0.05, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)
    Y_pred = clf.predict_classes(X_test).ravel()

    # compute the confusion matrix
    class_names = ['OK', 'RISK']
    cnf_matrix = confusion_matrix(Y_test, Y_pred)
    np.set_printoptions(precision=3)

    # plot the normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                          title='Normalized confusion matrix')
    plt.savefig(str(Path(data_fn).parent / 'confusion_matrix_CNN.png'))
    #plt.show()
    #loss, acc = clf.evaluate(X_test, Y_test)

P.S. The remaining functions are the same as in the answer below.
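A compatibility note: Sequential.predict_classes, used in main() here and in the answer below, existed in older standalone Keras but was removed in recent TensorFlow/Keras releases. If you run this code on a newer stack, a sketch of the equivalent for this single sigmoid output is to threshold the predicted probabilities yourself:

# equivalent of clf.predict_classes(X_test).ravel() on newer Keras/TensorFlow
import numpy as np
Y_pred = (clf.predict(X_test) > 0.5).astype(np.int32).ravel()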
I assumed that this is a classification task and that the last column, "Риск" ("Risk"), needs to be predicted.
Here's what I got:
Normalized confusion matrix
[[0.972 0.028]
 [0.047 0.953]]

Code:
import os
import itertools

import numpy as np
import pandas as pd

try:
    from pathlib import Path
except ImportError:
    from pathlib2 import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing.imputation import Imputer

from keras import Sequential
from keras.layers import *
from keras.optimizers import Adam, Nadam
from keras.regularizers import l1, l2
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.models import load_model

import matplotlib.pyplot as plt

######################

def get_data(filename, target_col_name='Риск', impute=True, normalize=True):
    df = pd.read_excel(filename, index_col=0)
    X, Y = df.iloc[:, :54], df[target_col_name]
    # get rid of columns with a single unique value (all rows have the same value)
    X = X.loc[:, X.nunique() > 1]
    # get lists of binary and numeric columns
    bin_cols = X.columns[X.nunique() == 2]
    num_cols = X.columns.difference(bin_cols)
    if impute:
        # impute missing data (NaNs)
        imp = Imputer()
        X = pd.DataFrame(imp.fit_transform(X), columns=X.columns, index=X.index)
    if normalize:
        # normalize the data
        scaler = StandardScaler()
        X[num_cols] = pd.DataFrame(scaler.fit_transform(X[num_cols]),
                                   columns=num_cols, index=X.index)
    return X, Y


def train_classifier(X, Y, validation_split=0.1, model_fn=None, lr=0.01,
                     epochs=100, batch_size=32, patience=30, verbose=1):
    # building a prediction model
    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=(49,), name='dense_1'))
    model.add(BatchNormalization(name='norm_1'))
    model.add(Dense(64, activation='relu', name='dense_2'))
    model.add(BatchNormalization(name='norm_2'))
    model.add(Dropout(0.1))
    model.add(Dense(128, activation='relu', name='dense_3'))
    model.add(BatchNormalization(name='norm_3'))
    model.add(Dropout(0.2))
    model.add(Dense(256, activation='relu', name='dense_4'))
    model.add(BatchNormalization(name='norm_4'))
    model.add(Dropout(0.25))
    model.add(Dense(1, activation='sigmoid', name='dense_out'))
    model.compile(optimizer=Nadam(lr=lr), loss='binary_crossentropy', metrics=['acc'])

    # Keras callbacks...
    early_stop = EarlyStopping(monitor='val_acc', min_delta=0.001, mode='auto',
                               patience=min(patience, epochs), verbose=1)
    if model_fn is None:
        model_fn = 'risk_clf.h5'  # fixed: the original referenced `data_fn`, which is out of scope here
    chkpt = ModelCheckpoint(str(model_fn), monitor='val_acc', verbose=1,
                            save_best_only=True, mode='auto')
    # training the model
    clf_hist = model.fit(X, Y, validation_split=validation_split, epochs=epochs,
                         batch_size=batch_size, callbacks=[early_stop, chkpt])
    return load_model(str(model_fn))


def plot_confusion_matrix(cm, classes, normalize=False,
                          title='Confusion matrix', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    (c) http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.3f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

#############################################################################

def main():
    work_dir = Path(r'D:\Work\ML\Classification\SO.885958-Keras_classification')
    data_fn = work_dir / 'ZADANIE.xlsx'
    clf_model_fn = work_dir / 'model.h5'
    EPOCHS = 200
    BATCH_SIZE = 32

    # get and normalize data
    X, Y = get_data(data_fn)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)

    clf = train_classifier(X_train, Y_train, validation_split=0.1,
                           model_fn=clf_model_fn, lr=0.05, epochs=EPOCHS,
                           batch_size=BATCH_SIZE, verbose=1)
    Y_pred = clf.predict_classes(X_test).ravel()

    # compute the confusion matrix
    class_names = ['OK', 'RISK']
    cnf_matrix = confusion_matrix(Y_test, Y_pred)
    np.set_printoptions(precision=3)

    # plot the normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                          title='Normalized confusion matrix')
    plt.savefig(str(Path(data_fn).parent / 'confusion_matrix.png'))
    #plt.show()
    #loss, acc = clf.evaluate(X_test, Y_test)

#############################################################################

if __name__ == "__main__":
    main()

In my solution, features 1, 3, 4, 5 and 6 were discarded when building the model. Some imports are unused in this code because there is one more function for computing parameters that I have not posted.
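The solution below scores clients with a random forest and reports the Gini coefficient, which its plotting_Gini function derives from ROC AUC as Gini = 2 * AUC - 1. A small self-contained check of that identity on made-up toy values (hypothetical data, purely to illustrate the formula):

from sklearn.metrics import roc_auc_score

actual  = [0, 0, 1, 1, 0, 1]                # toy ground-truth labels
predict = [0.1, 0.4, 0.35, 0.8, 0.2, 0.7]   # toy predicted probabilities
auc = roc_auc_score(actual, predict)
gini = 2 * auc - 1                          # same formula as in plotting_Gini below
print(f'AUC = {auc:.3f}, Gini = {gini:.3f}')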
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer as Imputer
from sklearn import ensemble
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from scipy.interpolate import interp1d
from scipy.integrate import quad
from sklearn.metrics import roc_auc_score, roc_curve
import warnings

warnings.filterwarnings('ignore')
plt.style.use('ggplot')


def plotting_Gini(targetcolumn, predictcolumn):
    actual = list(targetcolumn.values)
    predict = list(predictcolumn.values)
    data = zip(actual, predict)
    sorted_data = sorted(data, key=lambda d: d[1], reverse=True)
    sorted_actual = [d[0] for d in sorted_data]
    cumulative_actual = np.cumsum(sorted_actual) / sum(actual)
    cumulative_index = np.arange(1, len(cumulative_actual) + 1) / len(predict)
    cumulative_actual_perfect = np.cumsum(sorted(actual, reverse=True)) / sum(actual)
    aucroc = roc_auc_score(actual, predict)
    gini = 2 * roc_auc_score(actual, predict) - 1
    fpr, tpr, t = roc_curve(actual, predict)
    x_values = [0] + list(cumulative_index)
    y_values = [0] + list(cumulative_actual)
    y_values_perfect = [0] + list(cumulative_actual_perfect)

    fig, ax = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(18, 6))
    fig.suptitle(f'Gini = {gini}\n\n', fontsize=26, fontweight='bold')
    f1, f2 = interp1d(x_values, y_values), interp1d(x_values, y_values_perfect)
    S_pred = quad(f1, 0, 1, points=x_values, limit=len(x_values))[0] - 0.5
    S_actual = quad(f2, 0, 1, points=x_values, limit=len(x_values))[0] - 0.5

    ax[0].plot([0] + fpr.tolist(), [0] + tpr.tolist(), lw=2, color='red')
    ax[0].fill_between([0] + fpr.tolist(), [0] + tpr.tolist(), color='red', alpha=0.1)
    ax[0].text(0.4, 0.2, 'S = {:0.3f}'.format(aucroc), fontsize=28)
    ax[1].plot(x_values, y_values, lw=2, color='blue')
    ax[1].fill_between(x_values, x_values, y_values, color='blue', alpha=0.1)
    ax[1].text(0.4, 0.2, 'S = {:0.3f}'.format(S_pred), fontsize=28)
    ax[2].plot(x_values, y_values_perfect, lw=2, color='green')
    ax[2].fill_between(x_values, x_values, y_values_perfect, color='green', alpha=0.1)
    ax[2].text(0.4, 0.2, 'S = {:0.3f}'.format(S_actual), fontsize=28)
    ax[0].set(title='ROC-AUC Baseline', xlabel='False Positive Rate',
              ylabel='True Positive Rate', xlim=(0, 1), ylim=(0, 1))
    ax[1].set(title='Gini Baseline')
    ax[2].set(title='Gini Perfect')
    for i in range(1, 3):
        ax[i].plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
        ax[i].set(xlabel='Share of clients', ylabel='True Positive Rate',
                  xlim=(0, 1), ylim=(0, 1))
    plt.show()


def plotting_feature_priority(X, model, n=3):
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    feature_names = X.columns
    d_first = X.shape[1]
    plt.figure(figsize=(8, 8))
    plt.title("Feature importance")
    plt.bar(range(d_first), importances[indices[:d_first]], align='center')
    plt.xticks(range(d_first), np.array(feature_names)[indices[:d_first]], rotation=90)
    plt.xlim([-1, d_first])
    best_features = indices[:n]
    best_features_names = feature_names[best_features]
    print(f'Top {n} most important features {list(best_features_names)} out of {d_first}')
    plt.show()


def normalize_delete_Nans(features, target, impute=True, normalize=True):
    """Remove NaNs and normalize values."""
    X, y = features, target
    # drop columns where all rows share the same value
    # alternative approach
    # (.nunique() returns the number of unique values in a column):
    # notsamevls = [clmn for clmn in X.columns if X[clmn].nunique() > 1]
    # X = X[notsamevls]
    X = X.loc[:, X.nunique() > 1]
    # get lists of binary and numeric columns
    bin_cols = X.columns[X.nunique() == 2]
    num_cols = X.columns.difference(bin_cols)
    if impute:
        imp = Imputer()
        # remove NaNs; alternative options:
        # X = X.fillna(X.median(axis=0), axis=0)  # replace with medians
        # X.fillna(-999, inplace=True)            # replace with -999
        X = pd.DataFrame(imp.fit_transform(X), columns=X.columns, index=X.index)
    if normalize:
        scaler = StandardScaler()
        # "smoothing", i.e. normalizing the data; alternative approach
        # (scale each numeric feature to zero mean and unit standard deviation):
        # X[num_cols] = (X[num_cols] - X[num_cols].mean()) / X[num_cols].std()
        X[num_cols] = pd.DataFrame(scaler.fit_transform(X[num_cols]),
                                   columns=num_cols, index=X.index)
    return X, y


def create_and_learn_rf_classifier(X, y, n=1, inf=True):
    """Create and train an RF classifier; returns the model and the predicted
    result over all features. `n` selects the scored class column
    (0 scores in reverse order)."""
    # split the sample into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=42)
    # create and train the model
    rf = ensemble.RandomForestClassifier(
        bootstrap=True, class_weight=None, criterion='gini', max_depth=30,
        max_features=5, max_leaf_nodes=None, min_impurity_decrease=0.0,
        min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,
        min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=-1,
        oob_score=True, random_state=42, verbose=0, warm_start=False)
    rf.fit(X_train, y_train)
    prediction = rf.predict_proba(X)[:, n]
    if inf:
        err_test = np.mean(y_test != rf.predict(X_test))
        print(f'Mean share of correct answers: {100 - err_test * 100}%')
        print(f'Minimum predicted value: {min(prediction)}, maximum: {max(prediction)}')
    return rf, prediction


if __name__ == "__main__":
    drctry = 'C:\\Users\\Stepan\\Downloads\\ZADANIE.xlsx'
    df = pd.read_excel(drctry)  # adding index_col=0 would make the 1st column the index of df
    target = df['Риск']              # target
    features = df[df.columns[2:55]]  # features
    X, y = normalize_delete_Nans(features, target)
    #print(X.head())
    model, result = create_and_learn_rf_classifier(X, y, 0)
    # plot the feature priorities and the Gini index
    plotting_feature_priority(X, model, 10)
    plotting_Gini(y, pd.Series(create_and_learn_rf_classifier(X, y, 1, inf=False)[1],
                               index=X.index))
    # create a column with the scoring score and write it to a file
    df["Score"] = pd.DataFrame(np.array(result), index=X.index)
    #print(df["Score"].value_counts())
    df.to_excel('C:\\Users\\Stepan\\Downloads\\res.xlsx')

Source: https://ru.stackoverflow.com/questions/885958/