I was given this task as homework at an interview for an IT analyst vacancy. The solutions can be viewed in the answers below.

[image: first half of the columns]

[image: second half of the columns (the whole Excel spreadsheet is 5658 x 58)]

The task itself: [image: the task statement]

Link to the task file and data: http://yadi.sk/i/gDtn6k03w3cqWw

  • The problem statement is a bit odd. This looks like a regression task, since you need to predict two sums - so why do they call it classification? To confuse people? - MaxU
  • Could you upload the data to some file-sharing service? - MaxU
  • Yes, of course, here is the link: yadi.sk/i/gDtn6k03w3cqWw - Ste_kd
  • On the 2nd sheet it says "Use XGBoost". As far as I know, that is a library for the R language, so it is of no use to me: I can only use Python, I don't know much about machine learning yet, and I have just started reading the book "Introduction to Machine Learning with Python" by Andreas MΓΌller. I would be grateful for any pointers on how to solve this. - Ste_kd
  • I have not worked with XGBoost. I would rather use Keras / TensorFlow / PyTorch or, in the simplest case, sklearn. I will try to build a model when I have more free time - MaxU
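For what it is worth, XGBoost is not an R-only library: it also ships official Python bindings with a scikit-learn-compatible interface, so the "Use XGBoost" hint in the task does not rule out Python. Below is a minimal, untuned sketch of how it could be applied to this data, assuming the xgboost package is installed; the file path and hyperparameters are illustrative only, and 'Риск' is the target column of the task file:

    import pandas as pd
    import xgboost as xgb
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    # load the task file; the first column is used as the row index
    df = pd.read_excel('ZADANIE.xlsx', index_col=0)
    X, Y = df.iloc[:, :54], df['Риск']

    # XGBoost handles NaN's natively, so no explicit imputation step is needed here
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)

    # the hyperparameters below are placeholders, not tuned values
    clf = xgb.XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.1)
    clf.fit(X_train, Y_train)
    print('accuracy:', accuracy_score(Y_test, clf.predict(X_test)))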

3 answers

You could also use a deeper convolutional neural network (CNN). After 100 epochs of training, this model showed the best result - 97.17% correct predictions on previously unseen test data:

    In [34]: clf.evaluate(X_test, Y_test)
    566/566 [==============================] - 1s 2ms/step
    Out[34]: [0.08433653733465991, 0.9717314487632509]

    Normalized confusion matrix
    [[0.967 0.033]
     [0.014 0.986]]
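(The two numbers returned by clf.evaluate are, in order, the binary cross-entropy loss and the accuracy - the loss and metric the model was compiled with.)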

[image: normalized confusion matrix for the CNN model]

Example:

    def _conv1d(model, name_suffix=1, filters=32, kernel_size=3, padding='same',
                use_bias=True, max_pool_size=None, **kwargs):
        # one Conv1D block: convolution -> batch norm -> LeakyReLU [-> max pooling]
        model.add(Conv1D(filters=filters, kernel_size=kernel_size, padding=padding,
                         use_bias=use_bias, name=f'conv_{name_suffix}', **kwargs))
        model.add(BatchNormalization(name=f'norm_{name_suffix}'))
        model.add(LeakyReLU(alpha=0.1))
        if max_pool_size is not None:
            model.add(MaxPool1D(pool_size=max_pool_size))
        return model

    def train_CNN_clf(X, Y, validation_split=0.1, input_shape=(49, 1), model_fn=None,
                      lr=0.01, epochs=100, batch_size=32, patience=30, verbose=1):
        # building a prediction model
        model = Sequential(name='CNN')
        model = _conv1d(model, 1, 32, 5, 'same', max_pool_size=3, input_shape=input_shape)
        model = _conv1d(model, 2, 64, 5, 'same', max_pool_size=3)
        model = _conv1d(model, 3, 128, 5, 'same', max_pool_size=None)
        model = _conv1d(model, 4, 64, 5, 'same', max_pool_size=None)
        model = _conv1d(model, 5, 128, 5, 'same', max_pool_size=3)
        model = _conv1d(model, 6, 256, 5, 'same', max_pool_size=None)
        model = _conv1d(model, 7, 64, 5, 'same', max_pool_size=None)
        model.add(Flatten())
        model.add(Dense(64, activation='relu', name='dense_8'))
        model.add(Dropout(0.2))
        model.add(Dense(64, activation='relu', name='dense_9'))
        model.add(Dropout(0.2))
        model.add(Dense(1, activation='sigmoid', name='dense_out'))
        model.compile(optimizer=Nadam(lr=lr), loss='binary_crossentropy', metrics=['acc'])
        print(model.summary())

        # Keras callbacks: stop early if validation accuracy stalls,
        # and keep only the best model on disk
        early_stop = EarlyStopping(monitor='val_acc', min_delta=0.001, mode='auto',
                                   patience=min(patience, epochs), verbose=1)
        if model_fn is None:
            model_fn = 'model_cnn.h5'   # fall back to a file in the current directory
        chkpt = ModelCheckpoint(str(model_fn), monitor='val_acc', verbose=1,
                                save_best_only=True, mode='auto')
        # training the model
        clf_hist = model.fit(X, Y, validation_split=validation_split, epochs=epochs,
                             batch_size=batch_size, callbacks=[early_stop, chkpt])
        # return the best checkpoint, not the last epoch
        return load_model(model_fn)

    def main():
        work_dir = Path(r'D:\Work\ML\Classification\SO.885958-Keras_classification')
        data_fn = work_dir / 'ZADANIE.xlsx'
        model_fn = work_dir / 'model_cnn.h5'
        EPOCHS = 50
        BATCH_SIZE = 64

        # get and normalize data
        X, Y = get_data(data_fn)
        # Conv1D expects 3D input: (samples, steps, channels)
        X = np.expand_dims(X, axis=-1)
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)

        clf = train_CNN_clf(X_train, Y_train, validation_split=0.1, model_fn=model_fn,
                            lr=0.05, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)

        Y_pred = clf.predict_classes(X_test).ravel()

        # Compute confusion matrix
        class_names = ['OK', 'RISK']
        cnf_matrix = confusion_matrix(Y_test, Y_pred)
        np.set_printoptions(precision=3)

        # Plot normalized confusion matrix
        plt.figure()
        plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                              title='Normalized confusion matrix')
        plt.savefig(str(Path(data_fn).parent / 'confusion_matrix_CNN.png'))
        #plt.show()
        #loss, acc = clf.evaluate(X_test, Y_test)
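Note that a 1D convolutional network expects three-dimensional input of shape (samples, steps, channels); that is why main() adds a trailing channel axis with np.expand_dims(X, axis=-1), turning the 49 features of each row into a 49-step, single-channel sequence.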

P.S. The remaining functions (get_data, plot_confusion_matrix) are the same as in the next answer.

    I assumed that this is a classification task and that the last column, "Risk", is what needs to be predicted.
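    Before training it is worth checking how balanced the classes are, since plain accuracy can be misleading on a skewed target. A minimal sketch of such a check (the file path is illustrative; 'Риск' is the target column of the task file):

        import pandas as pd

        df = pd.read_excel('ZADANIE.xlsx', index_col=0)
        # share of each class in the target; a strong skew would make
        # plain accuracy a poor quality measure
        print(df['Риск'].value_counts(normalize=True))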

    Here's what I got:

        Normalized confusion matrix
        [[0.972 0.028]
         [0.047 0.953]]

    [image: normalized confusion matrix]

    Code:

        import os
        import itertools
        import numpy as np
        import pandas as pd
        try:
            from pathlib import Path
        except ImportError:
            from pathlib2 import Path
        from sklearn.model_selection import train_test_split
        from sklearn.preprocessing import StandardScaler, MinMaxScaler
        from sklearn.metrics import confusion_matrix
        try:
            from sklearn.impute import SimpleImputer as Imputer
        except ImportError:
            from sklearn.preprocessing.imputation import Imputer
        from keras import Sequential
        from keras.layers import *
        from keras.optimizers import Adam, Nadam
        from keras.regularizers import l1, l2
        from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
        from keras.models import load_model
        import matplotlib.pyplot as plt

        ######################

        def get_data(filename, target_col_name='Риск', impute=True, normalize=True):
            df = pd.read_excel(filename, index_col=0)
            X, Y = df.iloc[:, :54], df[target_col_name]
            # get rid of columns with a single unique value (all rows have the same value)
            X = X.loc[:, X.nunique() > 1]
            # get lists of binary and numeric columns
            bin_cols = X.columns[X.nunique() == 2]
            num_cols = X.columns.difference(bin_cols)
            if impute:
                # imputing missing data (NaN's)
                imp = Imputer()
                X = pd.DataFrame(imp.fit_transform(X), columns=X.columns, index=X.index)
            if normalize:
                # normalizing data
                scaler = StandardScaler()
                X[num_cols] = pd.DataFrame(scaler.fit_transform(X[num_cols]),
                                           columns=num_cols, index=X.index)
            return X, Y

        def train_classifier(X, Y, validation_split=0.1, model_fn=None, lr=0.01,
                             epochs=100, batch_size=32, patience=30, verbose=1):
            # building a prediction model
            model = Sequential()
            model.add(Dense(64, activation='relu', input_shape=(49,), name='dense_1'))
            model.add(BatchNormalization(name='norm_1'))
            model.add(Dense(64, activation='relu', name='dense_2'))
            model.add(BatchNormalization(name='norm_2'))
            model.add(Dropout(0.1))
            model.add(Dense(128, activation='relu', name='dense_3'))
            model.add(BatchNormalization(name='norm_3'))
            model.add(Dropout(0.2))
            model.add(Dense(256, activation='relu', name='dense_4'))
            model.add(BatchNormalization(name='norm_4'))
            model.add(Dropout(0.25))
            model.add(Dense(1, activation='sigmoid', name='dense_out'))
            model.compile(optimizer=Nadam(lr=lr), loss='binary_crossentropy', metrics=['acc'])

            # Keras callbacks: early stopping plus checkpointing of the best model
            early_stop = EarlyStopping(monitor='val_acc', min_delta=0.001, mode='auto',
                                       patience=min(patience, epochs), verbose=1)
            if model_fn is None:
                model_fn = 'risk_clf.h5'   # fall back to a file in the current directory
            chkpt = ModelCheckpoint(str(model_fn), monitor='val_acc', verbose=1,
                                    save_best_only=True, mode='auto')
            # training the model
            clf_hist = model.fit(X, Y, validation_split=validation_split, epochs=epochs,
                                 batch_size=batch_size, callbacks=[early_stop, chkpt])
            # return the best checkpoint, not the last epoch
            return load_model(model_fn)

        def plot_confusion_matrix(cm, classes, normalize=False,
                                  title='Confusion matrix', cmap=plt.cm.Blues):
            """
            This function prints and plots the confusion matrix.
            Normalization can be applied by setting `normalize=True`.
            (c) http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
            """
            if normalize:
                cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
                print("Normalized confusion matrix")
            else:
                print('Confusion matrix, without normalization')
            print(cm)

            plt.imshow(cm, interpolation='nearest', cmap=cmap)
            plt.title(title)
            plt.colorbar()
            tick_marks = np.arange(len(classes))
            plt.xticks(tick_marks, classes, rotation=45)
            plt.yticks(tick_marks, classes)

            fmt = '.3f' if normalize else 'd'
            thresh = cm.max() / 2.
            for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
                plt.text(j, i, format(cm[i, j], fmt),
                         horizontalalignment="center",
                         color="white" if cm[i, j] > thresh else "black")

            plt.ylabel('True label')
            plt.xlabel('Predicted label')
            plt.tight_layout()

        #############################################################################

        def main():
            work_dir = Path(r'D:\Work\ML\Classification\SO.885958-Keras_classification')
            data_fn = work_dir / 'ZADANIE.xlsx'
            clf_model_fn = work_dir / 'model.h5'
            EPOCHS = 200
            BATCH_SIZE = 32

            # get and normalize data
            X, Y = get_data(data_fn)
            X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)

            clf = train_classifier(X_train, Y_train, validation_split=0.1,
                                   model_fn=clf_model_fn, lr=0.05, epochs=EPOCHS,
                                   batch_size=BATCH_SIZE, verbose=1)

            Y_pred = clf.predict_classes(X_test).ravel()

            # Compute confusion matrix
            class_names = ['OK', 'RISK']
            cnf_matrix = confusion_matrix(Y_test, Y_pred)
            np.set_printoptions(precision=3)

            # Plot normalized confusion matrix
            plt.figure()
            plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                                  title='Normalized confusion matrix')
            plt.savefig(str(Path(data_fn).parent / 'confusion_matrix.png'))
            #plt.show()
            #loss, acc = clf.evaluate(X_test, Y_test)

        #############################################################################

        if __name__ == "__main__":
            main()
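    Since the network ends in a single sigmoid unit trained with binary cross-entropy, predict_classes simply thresholds the predicted probability at 0.5; if false negatives were more costly than false positives, the cut-off could be moved by hand, e.g. (clf.predict(X_test) > threshold).astype(int) with a hypothetical threshold below 0.5.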
    • Thanks! Tomorrow I will definitely take a look and try to reproduce your result - Ste_kd
    • To be honest, so far I have only managed to work through your get_data function. It does vaguely resemble what I did myself: reading the data, replacing NaNs, normalizing, and splitting into target and features. The rest is a dark forest to me - Ste_kd
    • @Ste_kd, if you have never dealt with machine learning tasks and neural networks before, then, frankly, it will be difficult to master all of this in a few days... - MaxU
    • I have added my solution (not without your help :)) - could you rate it? - Ste_kd
    • @Ste_kd, in my opinion you are doing very well. You should post your solution as an answer... - MaxU

    In my solution, features 1, 3, 4, 5 and 6 were discarded when building the model. Some of the imports are not used in this code, because there is one more function for computing parameters that I have not posted.

        import pandas as pd
        from sklearn.preprocessing import StandardScaler
        from sklearn.impute import SimpleImputer as Imputer
        from sklearn import ensemble
        import numpy as np
        import matplotlib.pyplot as plt
        from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
        from scipy.interpolate import interp1d
        from scipy.integrate import quad
        from sklearn.metrics import roc_auc_score, roc_curve
        import warnings

        warnings.filterwarnings('ignore')
        plt.style.use('ggplot')

        def plotting_Gini(targetcolumn, predictcolumn):
            actual = list(targetcolumn.values)
            predict = list(predictcolumn.values)
            # sort actual labels by descending predicted score
            data = zip(actual, predict)
            sorted_data = sorted(data, key=lambda d: d[1], reverse=True)
            sorted_actual = [d[0] for d in sorted_data]
            cumulative_actual = np.cumsum(sorted_actual) / sum(actual)
            cumulative_index = np.arange(1, len(cumulative_actual) + 1) / len(predict)
            cumulative_actual_perfect = np.cumsum(sorted(actual, reverse=True)) / sum(actual)
            aucroc = roc_auc_score(actual, predict)
            gini = 2 * roc_auc_score(actual, predict) - 1
            fpr, tpr, t = roc_curve(actual, predict)
            x_values = [0] + list(cumulative_index)
            y_values = [0] + list(cumulative_actual)
            y_values_perfect = [0] + list(cumulative_actual_perfect)
            fig, ax = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(18, 6))
            fig.suptitle(f'Gini = {gini}\n\n', fontsize=26, fontweight='bold')
            # areas under the model and "perfect" cumulative curves
            f1, f2 = interp1d(x_values, y_values), interp1d(x_values, y_values_perfect)
            S_pred = quad(f1, 0, 1, points=x_values, limit=len(x_values))[0] - 0.5
            S_actual = quad(f2, 0, 1, points=x_values, limit=len(x_values))[0] - 0.5
            ax[0].plot([0] + fpr.tolist(), [0] + tpr.tolist(), lw=2, color='red')
            ax[0].fill_between([0] + fpr.tolist(), [0] + tpr.tolist(), color='red', alpha=0.1)
            ax[0].text(0.4, 0.2, 'S = {:0.3f}'.format(aucroc), fontsize=28)
            ax[1].plot(x_values, y_values, lw=2, color='blue')
            ax[1].fill_between(x_values, x_values, y_values, color='blue', alpha=0.1)
            ax[1].text(0.4, 0.2, 'S = {:0.3f}'.format(S_pred), fontsize=28)
            ax[2].plot(x_values, y_values_perfect, lw=2, color='green')
            ax[2].fill_between(x_values, x_values, y_values_perfect, color='green', alpha=0.1)
            ax[2].text(0.4, 0.2, 'S = {:0.3f}'.format(S_actual), fontsize=28)
            ax[0].set(title='ROC-AUC Baseline', xlabel='False Positive Rate',
                      ylabel='True Positive Rate', xlim=(0, 1), ylim=(0, 1))
            ax[1].set(title='Gini Baseline')
            ax[2].set(title='Gini Perfect')
            for i in range(1, 3):
                ax[i].plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
                ax[i].set(xlabel='Share of clients', ylabel='True Positive Rate',
                          xlim=(0, 1), ylim=(0, 1))
            plt.show()

        def plotting_feature_priority(X, model, n=3):
            importances = model.feature_importances_
            indices = np.argsort(importances)[::-1]
            feature_names = X.columns
            d_first = X.shape[1]
            plt.figure(figsize=(8, 8))
            plt.title("Feature importance")
            plt.bar(range(d_first), importances[indices[:d_first]], align='center')
            plt.xticks(range(d_first), np.array(feature_names)[indices[:d_first]], rotation=90)
            plt.xlim([-1, d_first])
            best_features = indices[:n]
            best_features_names = feature_names[best_features]
            print(f'Top {n} most important features out of {d_first}: {list(best_features_names)}')
            plt.show()

        def normalize_delete_Nans(features, target, impute=True, normalize=True):
            """Removes NaN's and normalizes the values"""
            X, y = features, target
            # drop columns in which all values are identical
            # alternative (.nunique() returns the number of unique values in a column):
            # notsamevls = [clmn for clmn in X.columns if X[clmn].nunique() > 1]
            # X = X[notsamevls]
            X = X.loc[:, X.nunique() > 1]
            # get the lists of binary and numeric columns
            bin_cols = X.columns[X.nunique() == 2]
            num_cols = X.columns.difference(bin_cols)
            if impute:
                imp = Imputer()
                # getting rid of NaN's; alternatives:
                # X = X.fillna(X.median(axis=0), axis=0)  # replace with medians
                # X.fillna(-999, inplace=True)            # replace with the number -999
                X = pd.DataFrame(imp.fit_transform(X), columns=X.columns, index=X.index)
            if normalize:
                scaler = StandardScaler()
                # "smoothing", data normalization; alternative:
                # (each numeric feature is scaled to zero mean and unit standard deviation)
                # X[num_cols] = (X[num_cols] - X[num_cols].mean()) / X[num_cols].std()
                X[num_cols] = pd.DataFrame(scaler.fit_transform(X[num_cols]),
                                           columns=num_cols, index=X.index)
            return X, y

        def create_and_learn_rf_classifier(X, y, n=1, inf=True):
            '''Creates and trains an RF classifier; returns the model and the
            predicted result for all features.
            n - parameter for assigning the score; 0 - in reverse order'''
            # split the sample into training and test sets
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                                random_state=42)
            # create and train the model
            rf = ensemble.RandomForestClassifier(
                bootstrap=True, class_weight=None, criterion='gini', max_depth=30,
                max_features=5, max_leaf_nodes=None, min_impurity_decrease=0.0,
                min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,
                min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=-1,
                oob_score=True, random_state=42, verbose=0, warm_start=False)
            rf.fit(X_train, y_train)
            prediction = rf.predict_proba(X)[:, n]
            if inf:
                err_test = np.mean(y_test != rf.predict(X_test))
                print(f'Mean share of correct answers: {100 - err_test * 100}%')
                print(f'Minimum predicted value: {min(prediction)}, maximum: {max(prediction)}')
            return rf, prediction

        if __name__ == "__main__":
            drctry = 'C:\\Users\\Stepan\\Downloads\\ZADANIE.xlsx'
            df = pd.read_excel(drctry)  # ,index_col=0 would make the 1st column the index of df
            target = df['Риск']              # target
            features = df[df.columns[2:55]]  # features
            X, y = normalize_delete_Nans(features, target)
            #print(X.head())
            model, result = create_and_learn_rf_classifier(X, y, 0)
            # feature-importance chart and Gini index plots
            plotting_feature_priority(X, model, 10)
            plotting_Gini(y, pd.Series(create_and_learn_rf_classifier(X, y, 1, inf=False)[1],
                                       index=X.index))
            # create a column with the scoring value and write the result to a file
            df["Score"] = pd.DataFrame(np.array(result), index=X.index)
            #print(df["Score"].value_counts())
            df.to_excel('C:\\Users\\Stepan\\Downloads\\res.xlsx')
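    The Gini coefficient in the plot title is computed from ROC-AUC via the identity Gini = 2 * AUC - 1, so a random ranking gives Gini close to 0 and a perfect ranking gives Gini = 1.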

    [image: feature importance chart]

    [image: ROC-AUC and Gini curves]