The goal is to perform EDA, data preprocessing, and model training on synthetic regression datasets.
e.shaghaei@innopolis.university
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Author name parts; they are only used to build the submission file name.
FirstName = "Ehsan"
SecondName = "Shaghaei"
# File the final predictions are written to.
PATH = "{}_{}_answer.csv".format(FirstName, SecondName)
# Load both halves of the training set, order each by its 'index' column and
# join them column-wise.
#
# The index is reset on each half BEFORE concatenating: sort_values() keeps the
# original row labels, and pd.concat(axis=1) aligns rows on those labels, so
# without the reset the sort would be silently undone and the halves would be
# paired by file row order instead of by their 'index' value.
train1_df = (
    pd.read_csv('train_1.csv')
    .sort_values(['index'], axis=0)
    .drop(['index', 'y'], axis=1)  # train_1's own 'y' column is discarded
    .reset_index(drop=True)
)
train2_df = (
    pd.read_csv('train_2.csv')
    .sort_values(['index'], axis=0)
    .drop(['index'], axis=1)
    .reset_index(drop=True)
)
train = pd.concat([train1_df, train2_df], axis=1)
# Features are renamed 0..n-1 and the last column is labelled 'y'
# (presumably the target column of train_2.csv -- verify against the data).
train.columns = list(np.arange(len(train.columns) - 1)) + ['y']
X, y = train.drop(['y'], axis=1).to_numpy(), train['y'].to_numpy()
del train1_df, train2_df
# Read the two test files and place their columns side by side.
# NOTE(review): unlike the training files, no 'index' column is sorted on or
# dropped here -- confirm the test CSVs end up with the same feature layout as X.
test = pd.concat(
    [pd.read_csv(file_name) for file_name in ('test_1.csv', 'test_2.csv')],
    axis=1,
)
X_test = test.to_numpy()
No missing values in the dataset
# Count the NaNs of every column once, print the grand total, then show the
# per-column counts as a one-row table labelled '#na'.
na_per_column = train.isna().sum()
print('Total number of missing values =', na_per_column.sum())
na_per_column.to_frame('#na').T
Total number of missing values = 0
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
#na | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 rows × 45 columns
All features are numerical
# Summary statistics per column, extended with each column's correlation to
# the target and ordered by that correlation (ascending).
summary_stats = train.describe().T
target_corr = train.corr()['y'].rename("Correlation with y")
pd.concat([summary_stats, target_corr], axis=1).sort_values(by=["Correlation with y"])
count | mean | std | min | 25% | 50% | 75% | max | Correlation with y | |
---|---|---|---|---|---|---|---|---|---|
29 | 80.0 | 34.500000 | 17.994373 | 0.000000 | 20.000000 | 40.000000 | 40.000000 | 60.000000 | -0.243251 |
17 | 80.0 | -74.733383 | 247.740795 | -1613.024746 | -13.412025 | -0.240915 | -0.008462 | 197.142376 | -0.195266 |
16 | 80.0 | -112.521263 | 295.710699 | -1880.680981 | -45.778878 | -0.887245 | -0.005106 | 39.031156 | -0.172842 |
6 | 80.0 | -2.351983 | 77.652259 | -331.546048 | -3.029805 | 0.000752 | 0.217623 | 454.594079 | -0.164645 |
0 | 80.0 | -80.582229 | 276.550264 | -1439.497018 | -31.975664 | -0.243580 | -0.001820 | 477.101391 | -0.094760 |
18 | 80.0 | -87.273594 | 256.191189 | -1843.727756 | -33.075170 | -1.779391 | -0.008616 | 7.347473 | -0.088110 |
13 | 80.0 | -18.995333 | 261.266546 | -1863.013989 | -0.850975 | 0.003042 | 3.174767 | 663.566006 | -0.064214 |
21 | 80.0 | -787.596407 | 5391.638667 | -9516.922269 | -5427.792429 | -482.094438 | 4386.812805 | 7643.959901 | -0.053918 |
41 | 80.0 | 1490.837138 | 10901.679487 | -16418.565280 | -8189.349992 | 2026.585585 | 11834.232577 | 18611.421447 | -0.049949 |
14 | 80.0 | -60.392577 | 199.154289 | -1400.180655 | -12.051034 | -0.227355 | -0.005354 | 86.140239 | -0.046001 |
3 | 80.0 | 2.338493 | 156.277921 | -1198.079929 | -0.596784 | -0.000903 | 2.387703 | 479.944485 | -0.036481 |
38 | 80.0 | -68.372122 | 238.732293 | -1330.194911 | -15.956935 | -1.091534 | 0.475445 | 342.798811 | -0.019103 |
32 | 80.0 | 339.687500 | 17.840782 | 302.000000 | 328.000000 | 339.000000 | 349.250000 | 382.000000 | -0.017882 |
42 | 80.0 | -0.061532 | 0.512743 | -0.877858 | -0.467513 | -0.132897 | 0.353813 | 0.963759 | -0.011797 |
20 | 80.0 | -0.015383 | 0.128186 | -0.219465 | -0.116878 | -0.033224 | 0.088453 | 0.240940 | -0.011797 |
9 | 80.0 | -0.179968 | 147.012461 | -621.223213 | -1.014787 | -0.000710 | 0.377003 | 645.540094 | -0.006578 |
24 | 80.0 | -50.891264 | 166.754338 | -755.769270 | -8.733119 | -1.235735 | -0.335863 | 580.320825 | 0.016914 |
33 | 80.0 | -50.766684 | 166.505954 | -754.329130 | -9.810568 | -0.997848 | -0.040260 | 579.217175 | 0.017117 |
28 | 80.0 | -50.815457 | 166.667922 | -754.517216 | -9.214592 | -1.431091 | 0.429784 | 579.658897 | 0.017138 |
4 | 80.0 | -54.307997 | 260.391405 | -1898.011049 | -9.641641 | -0.157562 | -0.003524 | 653.743708 | 0.020324 |
36 | 80.0 | 3.034862 | 5.791297 | -6.897800 | -1.276007 | 3.006940 | 8.765734 | 11.871821 | 0.027164 |
15 | 80.0 | -14.481665 | 183.409377 | -758.157229 | -10.795008 | -0.002814 | 0.052728 | 797.714948 | 0.030858 |
19 | 80.0 | 16.499422 | 188.235492 | -523.954558 | -0.291279 | 0.000464 | 1.066398 | 830.954336 | 0.031524 |
5 | 80.0 | -5.552603 | 85.824270 | -318.070524 | -1.899302 | -0.004428 | 0.315378 | 336.878640 | 0.033506 |
11 | 80.0 | -70.464294 | 226.900483 | -1393.226297 | -18.651026 | -0.383273 | -0.003330 | 340.188393 | 0.038071 |
40 | 80.0 | -0.250514 | 12.394875 | -19.781351 | -12.078559 | -0.813549 | 10.486646 | 19.773214 | 0.041968 |
25 | 80.0 | 35.000000 | 113.739654 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 400.000000 | 0.046784 |
22 | 80.0 | -73.156733 | 4183.092591 | -6644.356512 | -4076.345973 | -279.383550 | 3601.164605 | 6758.130872 | 0.051859 |
26 | 80.0 | 5083.453671 | 25832.681003 | -35188.190110 | -19726.376246 | 3451.718362 | 26984.399853 | 48543.290782 | 0.054862 |
12 | 80.0 | 18.539897 | 121.218391 | -338.481404 | -0.359384 | -0.001367 | 0.227396 | 636.364712 | 0.055082 |
1 | 80.0 | -115.545630 | 321.397728 | -1729.602399 | -27.122270 | -0.351588 | -0.007806 | 2.830974 | 0.056481 |
31 | 80.0 | -0.126490 | 1.634420 | -2.953225 | -1.355274 | -0.516690 | 1.121191 | 2.966614 | 0.066566 |
43 | 80.0 | 0.008142 | 0.759392 | -0.998844 | -0.795375 | 0.013171 | 0.830095 | 0.999662 | 0.090619 |
23 | 80.0 | 11.662500 | 3.209731 | 4.000000 | 10.000000 | 11.500000 | 13.000000 | 19.000000 | 0.113238 |
34 | 80.0 | -54.321042 | 260.236053 | -1897.297483 | -9.173951 | -0.914691 | 0.398051 | 652.116706 | 0.133304 |
39 | 80.0 | -54.049604 | 260.375479 | -1897.223315 | -9.019194 | -0.722580 | 0.945966 | 653.883112 | 0.133725 |
8 | 80.0 | -68.355448 | 238.623840 | -1330.273023 | -15.063010 | -0.159016 | -0.002460 | 342.320900 | 0.145555 |
10 | 80.0 | -120.695022 | 365.670253 | -2383.578125 | -29.155045 | -0.304121 | -0.018051 | 7.647407 | 0.156768 |
7 | 80.0 | 25.881398 | 320.939149 | -1942.307789 | -0.223487 | 0.000092 | 0.967964 | 1571.577341 | 0.165770 |
35 | 80.0 | 0.842246 | 0.015697 | 0.814591 | 0.829022 | 0.845641 | 0.854277 | 0.866721 | 0.186285 |
37 | 80.0 | 1.002089 | 0.029106 | 0.952024 | 0.977357 | 1.007766 | 1.024158 | 1.048590 | 0.192627 |
30 | 80.0 | -70.614259 | 227.075136 | -1393.728355 | -17.172810 | -1.355515 | -0.134268 | 340.502638 | 0.198152 |
2 | 80.0 | -50.751215 | 166.565562 | -753.993706 | -9.276491 | -0.509786 | -0.009404 | 578.585039 | 0.202580 |
27 | 80.0 | 0.397493 | 4.952944 | -7.798978 | -3.169012 | 0.713438 | 4.763667 | 7.912771 | 0.526635 |
y | 80.0 | 11.282503 | 6.801663 | -0.246876 | 5.584919 | 10.531685 | 16.586325 | 26.762527 | 1.000000 |
# Bar chart of every feature's correlation with the target, ascending.
sorted_target_corr = train.corr()['y'].sort_values()
plt.figure(figsize=(8, 6))
# The last entry of the sorted series is left out of the plot (in the table
# above that is 'y' itself, with correlation 1.0).
sorted_target_corr.iloc[:-1].plot.bar()
plt.title('Features correlation with target')
plt.show()
As we can see, there is a considerable number of outliers in our dataset.
# Box plots of the features, eleven per figure, to make outliers visible.
group_size = 11
feature_columns = train.columns[:-1]  # every column except the trailing 'y'
for start in range(0, len(feature_columns), group_size):
    group = feature_columns[start:start + group_size]
    plt.figure(figsize=(4, 4))
    train[group].boxplot()
    # The title reports the columns actually drawn; it previously claimed a
    # span of five features while eleven were plotted.
    plt.title(f'Box Diagram of features [{group[0]}..{group[-1]}]')
    plt.show()
To evaluate the prediction quality, I will use the RMSE metric.
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
# Scoring function for the model comparison; its result is passed through
# np.sqrt where it is used, so the reported figure is RMSE.
evaluation_metric = MSE
# Scalers
from sklearn.preprocessing import RobustScaler,StandardScaler,MinMaxScaler
# dimention reduction
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
# Regressor Models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import AdaBoostRegressor
# Hold out 20% of the training rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=.2, shuffle=True, random_state=42,
)

# Candidate pipeline stages. Every entry is a (label, estimator) pair; a None
# estimator is the "do nothing" option for that stage.
scalers = [
    ('No Scaler', None),
    ('RobustScaler', RobustScaler()),
    ('StandardScaler', StandardScaler()),
    ('MinMaxScaler', MinMaxScaler()),
]

dimention_reduction = [
    ('No dr', None),
    # Keep as many components as needed to retain 95% of the variance.
    ('PCA', PCA(n_components=.95)),
]

base_models = [
    ("LinearRegressor", LinearRegression()),
    ("KNeighborsRegressor", KNeighborsRegressor(n_jobs=-1)),
    ("AdaBoostRegressor", AdaBoostRegressor(random_state=42)),
    ("MLPRegressor", MLPRegressor(random_state=42, max_iter=5000, solver='adam')),
]
# Fit every scaler x dimensionality-reduction x regressor combination on the
# training split, score it on the validation split and rank the pipelines by
# RMSE (best first).
columns = ['Scaler', 'DimentionReduction', 'Model', 'RMSE']
# Rows are gathered in a plain list and turned into a frame once at the end:
# DataFrame.append is deprecated (and removed in pandas 2.0), and it copied
# the whole frame on every iteration.
records = []
for sc_name, sc in scalers:
    for dr_name, dr in dimention_reduction:
        for m_name, m in base_models:
            # None entries stand for "no scaler" / "no reduction".
            model = Pipeline(steps=[
                ('scaler', sc),
                ('normalizer', dr),  # step name kept as-is; it holds the PCA stage
                ('model', m),
            ])
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)
            mse = evaluation_metric(y_val, y_pred)
            rmse = np.sqrt(mse)
            # The estimator objects themselves (not their labels) are stored,
            # so the table shows their repr.
            records.append(dict(zip(columns, [sc, dr, m, rmse])))
scores = pd.DataFrame(records, columns=columns)
scores = scores.sort_values('RMSE').reset_index(drop=True)
scores
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. 
scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. 
scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. 
scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True) /tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. scores = scores.append(result,ignore_index=True)
Scaler | DimentionReduction | Model | RMSE | |
---|---|---|---|---|
0 | MinMaxScaler() | PCA(n_components=0.95) | KNeighborsRegressor(n_jobs=-1) | 3.257337 |
1 | MinMaxScaler() | None | KNeighborsRegressor(n_jobs=-1) | 3.380094 |
2 | StandardScaler() | PCA(n_components=0.95) | (DecisionTreeRegressor(max_depth=3, random_sta... | 3.390201 |
3 | MinMaxScaler() | PCA(n_components=0.95) | (DecisionTreeRegressor(max_depth=3, random_sta... | 3.834182 |
4 | MinMaxScaler() | None | (DecisionTreeRegressor(max_depth=3, random_sta... | 4.045414 |
5 | None | None | (DecisionTreeRegressor(max_depth=3, random_sta... | 4.045414 |
6 | StandardScaler() | None | (DecisionTreeRegressor(max_depth=3, random_sta... | 4.045414 |
7 | RobustScaler() | None | (DecisionTreeRegressor(max_depth=3, random_sta... | 4.045414 |
8 | None | None | KNeighborsRegressor(n_jobs=-1) | 4.163596 |
9 | StandardScaler() | None | MLPRegressor(max_iter=5000, random_state=42) | 4.356136 |
10 | None | PCA(n_components=0.95) | KNeighborsRegressor(n_jobs=-1) | 4.460120 |
11 | StandardScaler() | None | KNeighborsRegressor(n_jobs=-1) | 4.469975 |
12 | None | PCA(n_components=0.95) | (DecisionTreeRegressor(max_depth=3, random_sta... | 4.494372 |
13 | StandardScaler() | PCA(n_components=0.95) | MLPRegressor(max_iter=5000, random_state=42) | 4.580996 |
14 | StandardScaler() | PCA(n_components=0.95) | KNeighborsRegressor(n_jobs=-1) | 4.852672 |
15 | MinMaxScaler() | PCA(n_components=0.95) | MLPRegressor(max_iter=5000, random_state=42) | 4.978539 |
16 | MinMaxScaler() | None | MLPRegressor(max_iter=5000, random_state=42) | 5.539939 |
17 | None | PCA(n_components=0.95) | LinearRegression() | 5.601346 |
18 | MinMaxScaler() | PCA(n_components=0.95) | LinearRegression() | 5.824626 |
19 | StandardScaler() | PCA(n_components=0.95) | LinearRegression() | 5.862654 |
20 | RobustScaler() | None | KNeighborsRegressor(n_jobs=-1) | 6.683997 |
21 | RobustScaler() | PCA(n_components=0.95) | KNeighborsRegressor(n_jobs=-1) | 6.845445 |
22 | RobustScaler() | PCA(n_components=0.95) | LinearRegression() | 6.847745 |
23 | RobustScaler() | PCA(n_components=0.95) | (DecisionTreeRegressor(max_depth=3, random_sta... | 7.942273 |
24 | None | None | LinearRegression() | 11.667120 |
25 | RobustScaler() | None | LinearRegression() | 11.667120 |
26 | StandardScaler() | None | LinearRegression() | 11.667120 |
27 | MinMaxScaler() | None | LinearRegression() | 11.667120 |
28 | RobustScaler() | None | MLPRegressor(max_iter=5000, random_state=42) | 19.508608 |
29 | RobustScaler() | PCA(n_components=0.95) | MLPRegressor(max_iter=5000, random_state=42) | 37.253952 |
30 | None | PCA(n_components=0.95) | MLPRegressor(max_iter=5000, random_state=42) | 40.234759 |
31 | None | None | MLPRegressor(max_iter=5000, random_state=42) | 846.052649 |
As the results show, the K-Neighbors Regressor, applied after scaling the data with MinMaxScaler and reducing its dimensionality with PCA (keeping enough components to retain 95% of the data variance), achieves RMSE = 3.257337, which is the lowest among the evaluated regression pipelines. We now export the predictions of this candidate model to a CSV file.
# Rebuild the best-scoring pipeline (MinMaxScaler -> PCA -> KNN), predict the
# test set and export the test features together with the predicted 'y'.
final_steps = [
    ('scaler', MinMaxScaler()),
    ('normalizer', PCA(n_components=.95)),
    ('model', KNeighborsRegressor(n_jobs=-1)),
]
model = Pipeline(steps=final_steps)
# NOTE(review): fitted on the training split only, not on all of X -- confirm
# that leaving the validation rows out of the final fit is intended.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_df = pd.DataFrame({'y': y_pred})
# Despite its name, this frame holds the *test* rows plus their predictions.
labeled_train = pd.concat((test, y_pred_df), axis='columns')
# NOTE(review): the row index is written as an extra unnamed column -- confirm
# the expected submission format.
labeled_train.to_csv(PATH)
Grading schema: