import pandas   as pd
import numpy as np
import matplotlib.pyplot as plt
# Author name parts; they are joined below to form the submission file name.
FirstName = 'Ehsan'
SecondName = 'Shaghaei'
# Destination CSV for the exported predictions: "<FirstName>_<SecondName>_answer.csv".
PATH = '_'.join([FirstName, SecondName, 'answer.csv'])

# Load the two halves of the training set. Each file has an 'index' column
# identifying the sample; both halves are ordered by it so their rows line up.
#
# pd.concat(axis=1) aligns rows on the DataFrame index *labels*, not on row
# position. sort_values() keeps the original labels, so without resetting
# them the concat would silently undo the sort and could pair features and
# targets that belong to different samples. Hence reset_index(drop=True) on
# each half *before* concatenating.
train1_df = (
    pd.read_csv('train_1.csv')
    .sort_values('index')
    .drop(columns=['index', 'y'])  # the target is taken from train_2 instead
    .reset_index(drop=True)
)
train2_df = (
    pd.read_csv('train_2.csv')
    .sort_values('index')
    .drop(columns=['index'])
    .reset_index(drop=True)
)
train = pd.concat([train1_df, train2_df], axis=1)
# Rename the feature columns to 0..n-1 and label the last column 'y'.
# NOTE(review): assumes the target is the last column of train_2.csv - confirm.
train.columns = list(np.arange(len(train.columns) - 1)) + ['y']
# Feature matrix and target vector as NumPy arrays for scikit-learn.
X, y = train.drop(columns=['y']).to_numpy(), train['y'].to_numpy()
del train1_df, train2_df


# Read the two halves of the test set and place them side by side (column-wise).
# NOTE(review): unlike the training files, no 'index' column is sorted on or
# dropped here - confirm the test CSVs contain feature columns only and are
# already row-aligned.
test = pd.concat(
    [pd.read_csv(csv_name) for csv_name in ('test_1.csv', 'test_2.csv')],
    axis=1,
)
# NumPy array of the test features, used later for prediction.
X_test = test.to_numpy()


# Number of missing entries in each column of the training frame.
na_per_column = train.isna().sum()
print('Total number of missing values =', na_per_column.sum())
# One-row table (row label '#na') showing the missing count of every column;
# left as the last expression so the notebook displays it.
pd.DataFrame(na_per_column, columns=['#na']).transpose()

Total number of missing values = 0


# Summary statistics (one row per column) joined with each column's
# correlation to the target 'y', sorted by that correlation in ascending order.
summary_stats = train.describe().T
target_corr = train.corr()['y'].rename("Correlation with y")
pd.concat([summary_stats, target_corr], axis=1).sort_values(by=["Correlation with y"])


# Bar chart of the correlation between every feature and the target, ascending.
# After sorting, the last entry is expected to be 'y' itself (self-correlation
# of 1.0), so it is left out of the plot.
corr_with_target = train.corr()['y'].sort_values()
plt.figure(figsize=(8, 6))
corr_with_target.iloc[:-1].plot(kind='bar')
plt.title('Features correlation with target')
plt.show()


# Box plots of the features, 11 columns per figure.
group_size = 11
# Every column except the trailing target 'y', so the target can never slip
# into the last group when the feature count is not a multiple of group_size.
feature_columns = train.columns[:-1]
for start in range(0, len(feature_columns), group_size):
    group = feature_columns[start:start + group_size]
    plt.figure(figsize=(4, 4))
    train[group].boxplot()
    # Title shows the real index range of this group (the last one may be shorter).
    plt.title(f'Box Diagram of features [{start}..{start + len(group) - 1}]')
    plt.show()


from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error as MSE

# Metric used to score predictions: mean squared error (imported above as MSE).
evaluation_metric = MSE
# Scalers
from sklearn.preprocessing import RobustScaler,StandardScaler,MinMaxScaler


# Dimensionality reduction

from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
# Regressor Models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import AdaBoostRegressor

# Hold out 20% of the labelled samples for validation; shuffling uses a fixed
# seed so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Candidate scalers as (label, transformer) pairs; None stands for "no scaler".
scalers = list({
    'No Scaler': None,
    'RobustScaler': RobustScaler(),
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
}.items())

# Candidate dimensionality-reduction steps as (label, transformer) pairs;
# None stands for "no reduction".
dimention_reduction = list({
    'No dr': None,
    # Keep as many principal components as needed for 95% of the variance.
    'PCA': PCA(n_components=.95),
}.items())

# Candidate regressors as (label, estimator) pairs; seeds are fixed where the
# estimator accepts one.
base_models = list({
    "LinearRegressor": LinearRegression(),
    "KNeighborsRegressor": KNeighborsRegressor(n_jobs=-1),
    "AdaBoostRegressor": AdaBoostRegressor(random_state=42),
    "MLPRegressor": MLPRegressor(random_state=42, max_iter=5000, solver='adam'),
}.items())

# Column layout of the results table, and the (initially empty) table itself.
columns = ['Scaler', 'DimentionReduction', 'Model', 'RMSE']
scores = pd.DataFrame(columns=columns)



# Evaluate every scaler x dimensionality-reduction x regressor combination:
# fit on the training split and score by RMSE on the validation split.
rows = []
for sc_name, sc in scalers:
    for dr_name, dr in dimention_reduction:
        for m_name, m in base_models:
            model = Pipeline(steps=[
                ('scaler', sc),
                ('normalizer', dr),  # step name kept; it holds the reduction step
                ('model', m),
            ])
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)
            rmse = np.sqrt(evaluation_metric(y_val, y_pred))
            # Store the readable labels instead of the estimator objects, whose
            # reprs are long and get truncated when the table is displayed.
            rows.append(dict(zip(columns, [sc_name, dr_name, m_name, rmse])))

# Build the table once from the collected rows. DataFrame.append is deprecated
# (removed in pandas 2.0) and copies the whole frame on every call.
scores = pd.DataFrame(rows, columns=columns)
scores = scores.sort_values('RMSE').reset_index(drop=True)
scores

/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)
/tmp/ipykernel_888979/4292405875.py:55: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  scores = scores.append(result,ignore_index=True)


# Final model: MinMaxScaler -> PCA (95% variance) -> k-nearest-neighbours
# regressor, the combination with the lowest validation RMSE in the results
# table of this notebook.
model = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('normalizer', PCA(n_components=.95)),
    ('model', KNeighborsRegressor(n_jobs=-1)),
])

# Model selection is finished, so refit on *all* labelled samples (X, y)
# rather than only the 80% training split; the held-out validation rows are
# no longer needed for scoring.
model.fit(X, y)

# Predict the unlabelled test set and export features + predicted target.
y_pred = model.predict(X_test)
y_pred_df = pd.DataFrame(y_pred, columns=['y'])
# Despite its name, this frame holds the *test* features plus the predicted 'y'.
labeled_train = pd.concat([test, y_pred_df], axis=1)
labeled_train.to_csv(PATH)

	count	mean	std	min	25%	50%	75%	max	Correlation with y
29	80.0	34.500000	17.994373	0.000000	20.000000	40.000000	40.000000	60.000000	-0.243251
17	80.0	-74.733383	247.740795	-1613.024746	-13.412025	-0.240915	-0.008462	197.142376	-0.195266
16	80.0	-112.521263	295.710699	-1880.680981	-45.778878	-0.887245	-0.005106	39.031156	-0.172842
6	80.0	-2.351983	77.652259	-331.546048	-3.029805	0.000752	0.217623	454.594079	-0.164645
0	80.0	-80.582229	276.550264	-1439.497018	-31.975664	-0.243580	-0.001820	477.101391	-0.094760
18	80.0	-87.273594	256.191189	-1843.727756	-33.075170	-1.779391	-0.008616	7.347473	-0.088110
13	80.0	-18.995333	261.266546	-1863.013989	-0.850975	0.003042	3.174767	663.566006	-0.064214
21	80.0	-787.596407	5391.638667	-9516.922269	-5427.792429	-482.094438	4386.812805	7643.959901	-0.053918
41	80.0	1490.837138	10901.679487	-16418.565280	-8189.349992	2026.585585	11834.232577	18611.421447	-0.049949
14	80.0	-60.392577	199.154289	-1400.180655	-12.051034	-0.227355	-0.005354	86.140239	-0.046001
3	80.0	2.338493	156.277921	-1198.079929	-0.596784	-0.000903	2.387703	479.944485	-0.036481
38	80.0	-68.372122	238.732293	-1330.194911	-15.956935	-1.091534	0.475445	342.798811	-0.019103
32	80.0	339.687500	17.840782	302.000000	328.000000	339.000000	349.250000	382.000000	-0.017882
42	80.0	-0.061532	0.512743	-0.877858	-0.467513	-0.132897	0.353813	0.963759	-0.011797
20	80.0	-0.015383	0.128186	-0.219465	-0.116878	-0.033224	0.088453	0.240940	-0.011797
9	80.0	-0.179968	147.012461	-621.223213	-1.014787	-0.000710	0.377003	645.540094	-0.006578
24	80.0	-50.891264	166.754338	-755.769270	-8.733119	-1.235735	-0.335863	580.320825	0.016914
33	80.0	-50.766684	166.505954	-754.329130	-9.810568	-0.997848	-0.040260	579.217175	0.017117
28	80.0	-50.815457	166.667922	-754.517216	-9.214592	-1.431091	0.429784	579.658897	0.017138
4	80.0	-54.307997	260.391405	-1898.011049	-9.641641	-0.157562	-0.003524	653.743708	0.020324
36	80.0	3.034862	5.791297	-6.897800	-1.276007	3.006940	8.765734	11.871821	0.027164
15	80.0	-14.481665	183.409377	-758.157229	-10.795008	-0.002814	0.052728	797.714948	0.030858
19	80.0	16.499422	188.235492	-523.954558	-0.291279	0.000464	1.066398	830.954336	0.031524
5	80.0	-5.552603	85.824270	-318.070524	-1.899302	-0.004428	0.315378	336.878640	0.033506
11	80.0	-70.464294	226.900483	-1393.226297	-18.651026	-0.383273	-0.003330	340.188393	0.038071
40	80.0	-0.250514	12.394875	-19.781351	-12.078559	-0.813549	10.486646	19.773214	0.041968
25	80.0	35.000000	113.739654	0.000000	0.000000	0.000000	0.000000	400.000000	0.046784
22	80.0	-73.156733	4183.092591	-6644.356512	-4076.345973	-279.383550	3601.164605	6758.130872	0.051859
26	80.0	5083.453671	25832.681003	-35188.190110	-19726.376246	3451.718362	26984.399853	48543.290782	0.054862
12	80.0	18.539897	121.218391	-338.481404	-0.359384	-0.001367	0.227396	636.364712	0.055082
1	80.0	-115.545630	321.397728	-1729.602399	-27.122270	-0.351588	-0.007806	2.830974	0.056481
31	80.0	-0.126490	1.634420	-2.953225	-1.355274	-0.516690	1.121191	2.966614	0.066566
43	80.0	0.008142	0.759392	-0.998844	-0.795375	0.013171	0.830095	0.999662	0.090619
23	80.0	11.662500	3.209731	4.000000	10.000000	11.500000	13.000000	19.000000	0.113238
34	80.0	-54.321042	260.236053	-1897.297483	-9.173951	-0.914691	0.398051	652.116706	0.133304
39	80.0	-54.049604	260.375479	-1897.223315	-9.019194	-0.722580	0.945966	653.883112	0.133725
8	80.0	-68.355448	238.623840	-1330.273023	-15.063010	-0.159016	-0.002460	342.320900	0.145555
10	80.0	-120.695022	365.670253	-2383.578125	-29.155045	-0.304121	-0.018051	7.647407	0.156768
7	80.0	25.881398	320.939149	-1942.307789	-0.223487	0.000092	0.967964	1571.577341	0.165770
35	80.0	0.842246	0.015697	0.814591	0.829022	0.845641	0.854277	0.866721	0.186285
37	80.0	1.002089	0.029106	0.952024	0.977357	1.007766	1.024158	1.048590	0.192627
30	80.0	-70.614259	227.075136	-1393.728355	-17.172810	-1.355515	-0.134268	340.502638	0.198152
2	80.0	-50.751215	166.565562	-753.993706	-9.276491	-0.509786	-0.009404	578.585039	0.202580
27	80.0	0.397493	4.952944	-7.798978	-3.169012	0.713438	4.763667	7.912771	0.526635
y	80.0	11.282503	6.801663	-0.246876	5.584919	10.531685	16.586325	26.762527	1.000000

	Scaler	DimentionReduction	Model	RMSE
0	MinMaxScaler()	PCA(n_components=0.95)	KNeighborsRegressor(n_jobs=-1)	3.257337
1	MinMaxScaler()	None	KNeighborsRegressor(n_jobs=-1)	3.380094
2	StandardScaler()	PCA(n_components=0.95)	(DecisionTreeRegressor(max_depth=3, random_sta...	3.390201
3	MinMaxScaler()	PCA(n_components=0.95)	(DecisionTreeRegressor(max_depth=3, random_sta...	3.834182
4	MinMaxScaler()	None	(DecisionTreeRegressor(max_depth=3, random_sta...	4.045414
5	None	None	(DecisionTreeRegressor(max_depth=3, random_sta...	4.045414
6	StandardScaler()	None	(DecisionTreeRegressor(max_depth=3, random_sta...	4.045414
7	RobustScaler()	None	(DecisionTreeRegressor(max_depth=3, random_sta...	4.045414
8	None	None	KNeighborsRegressor(n_jobs=-1)	4.163596
9	StandardScaler()	None	MLPRegressor(max_iter=5000, random_state=42)	4.356136
10	None	PCA(n_components=0.95)	KNeighborsRegressor(n_jobs=-1)	4.460120
11	StandardScaler()	None	KNeighborsRegressor(n_jobs=-1)	4.469975
12	None	PCA(n_components=0.95)	(DecisionTreeRegressor(max_depth=3, random_sta...	4.494372
13	StandardScaler()	PCA(n_components=0.95)	MLPRegressor(max_iter=5000, random_state=42)	4.580996
14	StandardScaler()	PCA(n_components=0.95)	KNeighborsRegressor(n_jobs=-1)	4.852672
15	MinMaxScaler()	PCA(n_components=0.95)	MLPRegressor(max_iter=5000, random_state=42)	4.978539
16	MinMaxScaler()	None	MLPRegressor(max_iter=5000, random_state=42)	5.539939
17	None	PCA(n_components=0.95)	LinearRegression()	5.601346
18	MinMaxScaler()	PCA(n_components=0.95)	LinearRegression()	5.824626
19	StandardScaler()	PCA(n_components=0.95)	LinearRegression()	5.862654
20	RobustScaler()	None	KNeighborsRegressor(n_jobs=-1)	6.683997
21	RobustScaler()	PCA(n_components=0.95)	KNeighborsRegressor(n_jobs=-1)	6.845445
22	RobustScaler()	PCA(n_components=0.95)	LinearRegression()	6.847745
23	RobustScaler()	PCA(n_components=0.95)	(DecisionTreeRegressor(max_depth=3, random_sta...	7.942273
24	None	None	LinearRegression()	11.667120
25	RobustScaler()	None	LinearRegression()	11.667120
26	StandardScaler()	None	LinearRegression()	11.667120
27	MinMaxScaler()	None	LinearRegression()	11.667120
28	RobustScaler()	None	MLPRegressor(max_iter=5000, random_state=42)	19.508608
29	RobustScaler()	PCA(n_components=0.95)	MLPRegressor(max_iter=5000, random_state=42)	37.253952
30	None	PCA(n_components=0.95)	MLPRegressor(max_iter=5000, random_state=42)	40.234759
31	None	None	MLPRegressor(max_iter=5000, random_state=42)	846.052649

Regression with synthetic datasets

Ehsan Shaghaei

Loading Data

EDA

Preprocessing

Model Prediction Quality

Comment/Code Explanation

Export test results to CSV